diff --git a/umbra/anchor_algorithm.py b/umbra/anchor_algorithm.py index fb2417f503c2981a6c609fa73eaa040ea1f17f90..282cec7dbac8c9cc00fef9e034cc9e740b468b63 100644 --- a/umbra/anchor_algorithm.py +++ b/umbra/anchor_algorithm.py @@ -43,6 +43,13 @@ class AnchorAlgorithm(AlignmentStrategy): shd_index) shd_last_anchor = self._shadow.index(word.anchor) + 1 src_last_anchor = src_index + 1 + # If the last word is not an anchor, then still search for matches: + elif src_index == len(self._source)-1: + src_index += 1 + shd_index = len(self._shadow) + self._search_between_anchors(src_last_anchor, + shd_last_anchor, src_index, + shd_index) def _search_between_anchors(self, src_start, shd_start, src_end, shd_end): """Search for shadowed words between two anchored pairs of words. @@ -74,7 +81,7 @@ class AnchorAlgorithm(AlignmentStrategy): competing_over = None for shadow_index in range(shd_start, shd_end): shd_word = self._shadow[shadow_index] - if src_word.__eq__(shd_word) and not found: + if src_word.word == shd_word.word and not found: if 0.05 < src_word.get_difference(shd_word) < 3.0: if not shd_word.correct: found = True diff --git a/umbra/mistake_finder.py b/umbra/mistake_finder.py index f6296c1581a9334abc6d353e5fe2fc7068b8e90e..8ee6013efc2d599cf5e93d90ba0c9d8deb2fd026 100644 --- a/umbra/mistake_finder.py +++ b/umbra/mistake_finder.py @@ -5,24 +5,71 @@ from mistake_enum import Mistake class MistakeFinder: """Finds the mistakes in already aligned lists of Words""" - @staticmethod + def __init__(self): + self._source = None + self._shadow = None + def start(self, source, shadow): """Find all the mistakes and classify them.""" + self._source = source + self._shadow = shadow # Loop over the shadow: - for index, word in enumerate(shadow): - if not self._check_repetition(word, index): # If not a repetition, - if not self._check_semantic_mistake(word): # semantic, - if not self._check_phonentic_mistake(word): # or phonetic, - # ... then shadow word is random: - word.mistake = Mistake.RANDOM + for index, word in enumerate(self._shadow): + if not word.correct and word.mistake is None: + self._determine_mistake(index, word) # Loop over the source: - for word in source: + for word in self._source: if not word.shadowed and word.mistake is None: # If not yet marked as mistake, then it is skipped: word.mistake = Mistake.SKIPPED + for word in self._shadow: + if word.mistake == Mistake.REPETITION: + print(str(word) + " " + str(word.mistake)) + + def _determine_mistake(self, index, word): + if not self._check_repetition(word, index): # If not a repetition, + # if not self._check_semantic_mistake(word): # semantic, + # if not self._check_phonentic_mistake(word): # or phonetic, + # ... then shadow word is random: + word.mistake = Mistake.RANDOM def _check_repetition(self, word, index): - pass + assert 0 <= index < len(self._shadow) + found = self._check_pre_repetition(word, index) + if not found: + if index < len(self._shadow)-1 and self._shadow[index+1].word.find( + word.word) == 0: + word.mistake = Mistake.REPETITION + return True + return found + + def _check_pre_repetition(self, word, index): + found_before = False + stop = False + diff = 0 + i = index + + while not found_before and not stop and i > 0: + i -= 1 + stop = self._shadow[i].is_anchor() + found_at = self._shadow[i].word.find(word.word) + if found_at >= 0 and found_at == len(self._shadow[i].word)-\ + len(word.word): + found_before = True + + chain = found_before + while chain and index-i > 1: + diff += 1 + i += 1 + chain = self._shadow[i].word == self._shadow[index+diff].word + + if chain: + for j in range(index, index+diff+1): + self._shadow[j].mistake = Mistake.REPETITION + else: + return False + return True + # TODO: WordNet integration. Function below should be usable already! def _check_semantic_mistake(self, shd_word): diff --git a/umbra/statistics.py b/umbra/statistics.py index 4560d54fc9567acfab147daef35a3ea960bf7fd9..9c68e2df5788e3f7b61ae6b6ac096ed38a6b79f4 100644 --- a/umbra/statistics.py +++ b/umbra/statistics.py @@ -1,4 +1,5 @@ from saa_algorithm import SaaAlgorithm +import copy from saa_Romeo import SaaRomeo from anchor_algorithm import AnchorAlgorithm from mistake_finder import MistakeFinder @@ -27,6 +28,9 @@ class Statistics: source: the words in the source file shadow: the words in the shadow file """ + # Make a deepcopy such that the testing is equal for both strategies: + source_em = copy.deepcopy(source) + shadow_em = copy.deepcopy(shadow) # Alignment 0 print('Romeo') @@ -37,23 +41,20 @@ class Statistics: print(f'source: {s_word.source} shadow: {s_word}') correctness = self._strategy.correctly_shadowed(source) - # Reset the is_shadowed property - for word in source: - word.shadowed = False - - # The other strategies have as of yet not been adapted - # Alignment 1 (Thijs) + # Alignment 1 (Thijs) (yet to be implemented) # Alignment 2 print('\n Emma') self._strategy = AnchorAlgorithm() - source_align_em, shadow_align_em = self._strategy.align(source, shadow) + source_align_em, shadow_align_em = self._strategy.align(source_em, + shadow_em) for s_word in shadow_align_em: if s_word.has_source(): print(f'source: {s_word.source} shadow: {s_word}') - correctness = self._strategy.correctly_shadowed(source) + correctness = self._strategy.correctly_shadowed(source_em) # TODO: Make the mistake finding work with the statement below - # MistakeFinder.start(source_align_em, shadow_align_em) - - return source_align, shadow_align, correctness + finder = MistakeFinder() + finder.start(source_align_em, shadow_align_em) + + return source_align_em, shadow_align_em, correctness diff --git a/umbra/words.py b/umbra/words.py index 9a477d07d4926234e4e0bb86f23a24f10fd77b17..aa25e59fce07406a0781ea673170478e435c67c6 100644 --- a/umbra/words.py +++ b/umbra/words.py @@ -92,7 +92,11 @@ class Word: Args: anchor: Word instance to anchor to """ - self._anchor = anchor + self._set_anchor(anchor) + + def _set_anchor(self, anchor): + """Anchor setter. Has to be overridden in the subclass.""" + raise NotImplementedError def get_difference(self, other): """Get the difference between the onset of this word and the other. @@ -162,7 +166,11 @@ class ShadowWord(Word): """ assert self._correct is False assert mistake != Mistake.SKIPPED - self._mistake(mistake) + self._mistake = mistake + + def _set_anchor(self, anchor): + self._anchor = anchor + self._source = anchor class SourceWord(Word): @@ -195,7 +203,10 @@ class SourceWord(Word): mistake: Mistake Enum """ assert mistake != Mistake.RANDOM - self._mistake(mistake) + self._mistake = mistake + + def _set_anchor(self, anchor): + self._anchor = anchor class Sentence(list):