Merge branch 'SC-113/implement-repetition-mistakes'

9a1156c1 · Alfen, T. van (Tanja) · 7def4f12 · 67653870 · 9a1156c1 · 9a1156c1
Commit 9a1156c1 authored 5 years ago by Alfen, T. van (Tanja)
--- a/umbra/anchor_algorithm.py
+++ b/umbra/anchor_algorithm.py
@@ -43,6 +43,13 @@ class AnchorAlgorithm(AlignmentStrategy):
                                                 shd_index)
                shd_last_anchor = self._shadow.index(word.anchor) + 1
                src_last_anchor = src_index + 1
+            # If the last word is not an anchor, then still search for matches:
+            elif src_index == len(self._source)-1:
+                src_index += 1
+                shd_index = len(self._shadow)
+                self._search_between_anchors(src_last_anchor,
+                                             shd_last_anchor, src_index,
+                                             shd_index)
    def _search_between_anchors(self, src_start, shd_start, src_end, shd_end):
        """Search for shadowed words between two anchored pairs of words.
@@ -74,7 +81,7 @@ class AnchorAlgorithm(AlignmentStrategy):
        competing_over = None
        for shadow_index in range(shd_start, shd_end):
            shd_word = self._shadow[shadow_index]
-            if src_word.__eq__(shd_word) and not found:
+            if src_word.word == shd_word.word and not found:
                if 0.05 < src_word.get_difference(shd_word) < 3.0:
                    if not shd_word.correct:
                        found = True

--- a/umbra/mistake_finder.py
+++ b/umbra/mistake_finder.py
@@ -5,24 +5,71 @@ from mistake_enum import Mistake
 class MistakeFinder:
    """Finds the mistakes in already aligned lists of Words"""
-    @staticmethod
+    def __init__(self):
+        """Create a finder that has no word lists attached yet."""
+        # Both attributes start out empty; start() assigns the actual lists.
+        self._source = None
+        self._shadow = None
    def start(self, source, shadow):
        """Find all the mistakes and classify them."""
+        self._source = source
+        self._shadow = shadow
        # Loop over the shadow:
-        for index, word in enumerate(shadow):
+        for index, word in enumerate(self._shadow):
-            if not self._check_repetition(word, index):  # If not a repetition,
+            if not word.correct and word.mistake is None:
-                if not self._check_semantic_mistake(word):  # semantic,
+                self._determine_mistake(index, word)
-                    if not self._check_phonentic_mistake(word):  # or phonetic,
-                        # ... then shadow word is random:
-                        word.mistake = Mistake.RANDOM
        # Loop over the source:
-        for word in source:
+        for word in self._source:
            if not word.shadowed and word.mistake is None:
                # If not yet marked as mistake, then it is skipped:
                word.mistake = Mistake.SKIPPED
+        for word in self._shadow:
+            if word.mistake == Mistake.REPETITION:
+                print(str(word) + " " + str(word.mistake))
+    def _determine_mistake(self, index, word):
+        """Classify a single shadow word.
+
+        The word is marked as Mistake.RANDOM unless _check_repetition
+        reports it as a repetition. The semantic and phonetic checks are
+        commented out below and are therefore not consulted.
+
+        Args:
+            index: position of the word in the shadow list
+            word: the shadow word to classify
+        """
+        if not self._check_repetition(word, index):  # If not a repetition,
+            # if not self._check_semantic_mistake(word):  # semantic,
+                # if not self._check_phonentic_mistake(word):  # or phonetic,
+                    # ... then shadow word is random:
+                    word.mistake = Mistake.RANDOM
    def _check_repetition(self, word, index):
-        pass
+        """Check whether the shadow word is part of a repetition.
+
+        First delegates to _check_pre_repetition. If that finds nothing,
+        the word is marked as Mistake.REPETITION when the text of the next
+        shadow word starts with the text of this word.
+
+        Args:
+            word: the shadow word to check
+            index: position of word in the shadow list
+
+        Returns:
+            True if a repetition was found, False otherwise.
+        """
+        assert 0 <= index < len(self._shadow)
+        found = self._check_pre_repetition(word, index)
+        if not found:
+            # find(...) == 0 means the next word begins with this word.
+            if index < len(self._shadow)-1 and self._shadow[index+1].word.find(
+                    word.word) == 0:
+                word.mistake = Mistake.REPETITION
+                return True
+        return found
+    def _check_pre_repetition(self, word, index):
+        """Check whether the word repeats (the end of) an earlier shadow word.
+
+        Walks backwards from index until a shadow word is found whose text
+        ends with word.word, or until an anchored word or the start of the
+        list has been examined. If the match directly precedes index, only
+        the word itself counts as a repetition. Otherwise the words between
+        the match and index have to recur, in the same order, directly after
+        index for the whole run to count as a repetition.
+
+        Args:
+            word: the shadow word to check
+            index: position of word in the shadow list
+
+        Returns:
+            True if a repetition was found (the words involved are then
+            marked as Mistake.REPETITION), False otherwise.
+        """
+        found_before = False
+        stop = False
+        i = index
+        while not found_before and not stop and i > 0:
+            i -= 1
+            # An anchored word is still examined, but it ends the search.
+            stop = self._shadow[i].is_anchor()
+            # endswith also matches when the text occurs earlier in the
+            # candidate as well (e.g. "haha" followed by "ha"), which a
+            # first-occurrence find() comparison would miss.
+            found_before = self._shadow[i].word.endswith(word.word)
+        if not found_before:
+            return False
+        # Number of words between the match and index that have to recur.
+        gap = index - i - 1
+        # The repeated run cannot extend past the end of the shadow list;
+        # without this guard the comparison below raises an IndexError.
+        if index + gap >= len(self._shadow):
+            return False
+        for offset in range(1, gap + 1):
+            if self._shadow[i + offset].word != \
+                    self._shadow[index + offset].word:
+                return False
+        for j in range(index, index + gap + 1):
+            self._shadow[j].mistake = Mistake.REPETITION
+        return True
    # TODO: WordNet integration. Function below should be usable already!
    def _check_semantic_mistake(self, shd_word):

--- a/umbra/statistics.py
+++ b/umbra/statistics.py
 from saa_algorithm import SaaAlgorithm
+import copy
 from saa_Romeo import SaaRomeo
 from anchor_algorithm import AnchorAlgorithm
 from mistake_finder import MistakeFinder
@@ -27,6 +28,9 @@ class Statistics:
            source: the words in the source file
            shadow: the words in the shadow file
        """
+        # Make a deepcopy such that the testing is equal for both strategies:
+        source_em = copy.deepcopy(source)
+        shadow_em = copy.deepcopy(shadow)
        # Alignment 0
        print('Romeo')
@@ -37,23 +41,20 @@ class Statistics:
                print(f'source: {s_word.source} shadow: {s_word}')
        correctness = self._strategy.correctly_shadowed(source)
-        # Reset the is_shadowed property
+        # Alignment 1 (Thijs) (yet to be implemented)
-        for word in source:
-            word.shadowed = False
-        # The other strategies have as of yet not been adapted
-        # Alignment 1 (Thijs)
        # Alignment 2
        print('\n Emma')
        self._strategy = AnchorAlgorithm()
-        source_align_em, shadow_align_em = self._strategy.align(source, shadow)
+        source_align_em, shadow_align_em = self._strategy.align(source_em,
+                                                                shadow_em)
        for s_word in shadow_align_em:
            if s_word.has_source():
                print(f'source: {s_word.source} shadow: {s_word}')
-        correctness = self._strategy.correctly_shadowed(source)
+        correctness = self._strategy.correctly_shadowed(source_em)
        # TODO: Make the mistake finding work with the statement below
-        # MistakeFinder.start(source_align_em, shadow_align_em)
+        finder = MistakeFinder()
+        finder.start(source_align_em, shadow_align_em)
-        return source_align, shadow_align, correctness
+        return source_align_em, shadow_align_em, correctness
--- a/umbra/words.py
+++ b/umbra/words.py
@@ -92,7 +92,11 @@ class Word:
        Args:
             anchor: Word instance to anchor to
        """
-        self._anchor = anchor
+        self._set_anchor(anchor)
+    def _set_anchor(self, anchor):
+        """Anchor setter. Has to be overridden in the subclass.
+
+        Args:
+            anchor: Word instance to anchor to
+
+        Raises:
+            NotImplementedError: always, in this base implementation
+        """
+        raise NotImplementedError
    def get_difference(self, other):
        """Get the difference between the onset of this word and the other.
@@ -162,7 +166,11 @@ class ShadowWord(Word):
        """
        assert self._correct is False
        assert mistake != Mistake.SKIPPED
-        self._mistake(mistake)
+        self._mistake = mistake
+    def _set_anchor(self, anchor):
+        """Store the anchor and also register it as the source of this word.
+
+        Args:
+            anchor: Word instance to anchor to
+        """
+        self._anchor = anchor
+        self._source = anchor
 class SourceWord(Word):
@@ -195,7 +203,10 @@ class SourceWord(Word):
            mistake: Mistake Enum
        """
        assert mistake != Mistake.RANDOM
-        self._mistake(mistake)
+        self._mistake = mistake
+    def _set_anchor(self, anchor):
+        """Store the anchor of this source word.
+
+        Args:
+            anchor: Word instance to anchor to
+        """
+        self._anchor = anchor
 class Sentence(list):