From 8ec11dd5e2419cffdbddb7720892b3e61953f16b Mon Sep 17 00:00:00 2001
From: "Vriezen, E.C. (Emma)" <e.vriezen@student.ru.nl>
Date: Sun, 1 Dec 2019 20:03:50 +0100
Subject: [PATCH] Repetition checking now works (in mistake_finder.py) and is
 run after the anchor algorithm.

---
 documentation/MistakeDefinitions.txt |  1 +
 umbra/filereader.py                  |  2 +-
 umbra/mistake_finder.py              | 54 +++++++++++++++++++++++-----
 umbra/words.py                       | 15 ++++++--
 4 files changed, 60 insertions(+), 12 deletions(-)

diff --git a/documentation/MistakeDefinitions.txt b/documentation/MistakeDefinitions.txt
index 91c8cdc8..96319ebc 100644
--- a/documentation/MistakeDefinitions.txt
+++ b/documentation/MistakeDefinitions.txt
@@ -1,4 +1,5 @@
 Stuttering/repetition mistakes:
+(These are all implemented in mistake_finder.py)
 
 - 2 or more times the same word in a row in the shadow, while it only appears
 once in the source. Example: 'en' and 'en' in file 7 (2nd 'en' is repetition)
diff --git a/umbra/filereader.py b/umbra/filereader.py
index b35c2589..9ce3f469 100644
--- a/umbra/filereader.py
+++ b/umbra/filereader.py
@@ -162,7 +162,7 @@ class CSVWriter(FileWriter):
         info = "Total " + str(info[0]), "Shadowed " + str(info[1]), "Skipped"\
             " " + str(info[2])
         sc = []
-        for entry in source.words:
+        for entry in source:
             sc.append([entry.word, entry.onset, entry.offset, entry.shadowed])
         with open(path+'.csv', 'w') as f:
             writer = csv.writer(f)
diff --git a/umbra/mistake_finder.py b/umbra/mistake_finder.py
index 447f2282..1d8ba502 100644
--- a/umbra/mistake_finder.py
+++ b/umbra/mistake_finder.py
@@ -6,22 +6,25 @@ class MistakeFinder:
     """Finds the mistakes in already aligned lists of Words"""
 
     def __init__(self):
-        self.source = None
-        self.shadow = None
+        self._source = None
+        self._shadow = None
 
     def start(self, source, shadow):
         """Find all the mistakes and classify them."""
-        self.source = source
-        self.shadow = shadow
+        self._source = source
+        self._shadow = shadow
         # Loop over the shadow:
-        for index, word in enumerate(self.shadow):
-            if not word.correct:
+        for index, word in enumerate(self._shadow):
+            if not word.correct and word.mistake is None:
                 self._determine_mistake(index, word)
         # Loop over the source:
-        for word in self.source:
+        for word in self._source:
             if not word.shadowed and word.mistake is None:
                 # If not yet marked as mistake, then it is skipped:
                 word.mistake = Mistake.SKIPPED
+        for word in self._shadow:
+            if word.mistake == Mistake.REPETITION:
+                print(word)
 
     def _determine_mistake(self, index, word):
         if not self._check_repetition(word, index):  # If not a repetition,
@@ -31,8 +34,41 @@ class MistakeFinder:
                     word.mistake = Mistake.RANDOM
 
     def _check_repetition(self, word, index):
-        assert 0 <= index < len(self.shadow)
-        # print(index)
+        """Flag word as REPETITION when detected; return True if flagged."""
+        assert 0 <= index < len(self._shadow)
+        if not self._check_pre_repetition(word, index):
+            last = index == len(self._shadow) - 1
+            if last or not self._shadow[index+1].word.startswith(word.word):
+                return False
+            word.mistake = Mistake.REPETITION
+        return True
+
+    def _check_pre_repetition(self, word, index):
+        """Mark word, and the words repeated after it, as REPETITION.
+
+        Scans back from index, up to and including the nearest anchor, for
+        a shadow word that ends with word.word. Every word between that
+        match and index must then recur directly after index.
+        Returns True if the words were marked, False otherwise.
+        """
+        i = index
+        found_before = False
+        while not found_before and i > 0:
+            i -= 1
+            found_before = self._shadow[i].word.endswith(word.word)
+            if self._shadow[i].is_anchor():
+                break
+        gap = index - i - 1  # Number of words between match and word.
+        # No match, or too few words left after index for the chain to
+        # recur: not a repetition (and index+diff must stay in range).
+        if not found_before or index + gap >= len(self._shadow):
+            return False
+        for diff in range(1, gap + 1):
+            if self._shadow[i+diff].word != self._shadow[index+diff].word:
+                return False
+        for j in range(index, index + gap + 1):
+            self._shadow[j].mistake = Mistake.REPETITION
+        return True
 
 
     # TODO: WordNet integration. Function below should be usable already!
diff --git a/umbra/words.py b/umbra/words.py
index 689fe325..de047c62 100644
--- a/umbra/words.py
+++ b/umbra/words.py
@@ -10,7 +10,7 @@ class Word:
         self._mistake = None
 
     def __str__(self):
-        return "%s | %f | %f" % (self._word, self._onset, self._offset)
+        return "%s | %s | %f | %f" % (self._word, self._mistake, self._onset, self._offset)
 
     def __len__(self):
         return 1
@@ -92,7 +92,11 @@ class Word:
         Args:
              anchor: Word instance to anchor to
         """
-        self._anchor = anchor
+        self._set_anchor(anchor)
+
+    def _set_anchor(self, anchor):
+        """Hook given the Word to anchor to; subclasses must override it."""
+        raise NotImplementedError
 
     def get_difference(self, other):
         """Get the difference between the onset of this word and the other.
@@ -164,6 +168,10 @@ class ShadowWord(Word):
         assert mistake != Mistake.SKIPPED
         self._mistake = mistake
 
+    def _set_anchor(self, anchor):
+        self._anchor = anchor  # The Word this shadow word is anchored to.
+        self._source = anchor  # Presumably the matched source word; confirm.
+
 
 class SourceWord(Word):
     def __init__(self, word, onset, offset):
@@ -197,6 +205,9 @@ class SourceWord(Word):
         assert mistake != Mistake.RANDOM
         self._mistake = mistake
 
+    def _set_anchor(self, anchor):
+        self._anchor = anchor  # Source words store only the anchor itself.
+
 
 class Sentence(list):
     def __init__(self, words):
-- 
GitLab