From 8ec11dd5e2419cffdbddb7720892b3e61953f16b Mon Sep 17 00:00:00 2001 From: "Vriezen, E.C. (Emma)" <e.vriezen@student.ru.nl> Date: Sun, 1 Dec 2019 20:03:50 +0100 Subject: [PATCH] The repetition checking works now (in mistake_finder.py). Is run after the anchor algorithm. --- documentation/MistakeDefinitions.txt | 1 + umbra/filereader.py | 2 +- umbra/mistake_finder.py | 54 +++++++++++++++++++++++----- umbra/words.py | 15 ++++++-- 4 files changed, 60 insertions(+), 12 deletions(-) diff --git a/documentation/MistakeDefinitions.txt b/documentation/MistakeDefinitions.txt index 91c8cdc8..96319ebc 100644 --- a/documentation/MistakeDefinitions.txt +++ b/documentation/MistakeDefinitions.txt @@ -1,4 +1,5 @@ Stuttering/repetition mistakes: +(These are all implemented in mistake_finder.py) - 2 or more times the same word in a row in the shadow, while it only appears once in the source. Example: 'en' and 'en' in file 7 (2nd 'en' is repetition) diff --git a/umbra/filereader.py b/umbra/filereader.py index b35c2589..9ce3f469 100644 --- a/umbra/filereader.py +++ b/umbra/filereader.py @@ -162,7 +162,7 @@ class CSVWriter(FileWriter): info = "Total " + str(info[0]), "Shadowed " + str(info[1]), "Skipped"\ " " + str(info[2]) sc = [] - for entry in source.words: + for entry in source: sc.append([entry.word, entry.onset, entry.offset, entry.shadowed]) with open(path+'.csv', 'w') as f: writer = csv.writer(f) diff --git a/umbra/mistake_finder.py b/umbra/mistake_finder.py index 447f2282..1d8ba502 100644 --- a/umbra/mistake_finder.py +++ b/umbra/mistake_finder.py @@ -6,22 +6,25 @@ class MistakeFinder: """Finds the mistakes in already aligned lists of Words""" def __init__(self): - self.source = None - self.shadow = None + self._source = None + self._shadow = None def start(self, source, shadow): """Find all the mistakes and classify them.""" - self.source = source - self.shadow = shadow + self._source = source + self._shadow = shadow # Loop over the shadow: - for index, word in enumerate(self.shadow): - if not word.correct: + for index, word in enumerate(self._shadow): + if not word.correct and word.mistake is None: self._determine_mistake(index, word) # Loop over the source: - for word in self.source: + for word in self._source: if not word.shadowed and word.mistake is None: # If not yet marked as mistake, then it is skipped: word.mistake = Mistake.SKIPPED + for word in self._shadow: + if word.mistake == Mistake.REPETITION: + print(word) def _determine_mistake(self, index, word): if not self._check_repetition(word, index): # If not a repetition, @@ -31,8 +34,41 @@ class MistakeFinder: word.mistake = Mistake.RANDOM def _check_repetition(self, word, index): - assert 0 <= index < len(self.shadow) - # print(index) + assert 0 <= index < len(self._shadow) + found = self._check_pre_repetition(word, index) + if not found: + if index < len(self._shadow)-1 and self._shadow[index+1].word.find( + word.word) == 0: + word.mistake = Mistake.REPETITION + return True + return found + + def _check_pre_repetition(self, word, index): + found_before = False + stop = False + diff = 0 + i = index + + while not found_before and not stop and i > 0: + i -= 1 + stop = self._shadow[i].is_anchor() + found_at = self._shadow[i].word.find(word.word) + if found_at >= 0 and found_at == len(self._shadow[i].word)-\ + len(word.word): + found_before = True + + chain = found_before + while chain and index-i > 1: + diff += 1 + i += 1 + chain = self._shadow[i].word == self._shadow[index+diff].word + + if chain: + for j in range(index, index+diff+1): + self._shadow[j].mistake = Mistake.REPETITION + else: + return False + return True # TODO: WordNet integration. Function below should be usable already! diff --git a/umbra/words.py b/umbra/words.py index 689fe325..de047c62 100644 --- a/umbra/words.py +++ b/umbra/words.py @@ -10,7 +10,7 @@ class Word: self._mistake = None def __str__(self): - return "%s | %f | %f" % (self._word, self._onset, self._offset) + return "%s | %s | %f | %f" % (self._word, self._mistake, self._onset, self._offset) def __len__(self): return 1 @@ -92,7 +92,11 @@ class Word: Args: anchor: Word instance to anchor to """ - self._anchor = anchor + self._set_anchor(anchor) + + def _set_anchor(self, anchor): + """Anchor setter. Has to be overridden in the subclass.""" + raise NotImplementedError def get_difference(self, other): """Get the difference between the onset of this word and the other. @@ -164,6 +168,10 @@ class ShadowWord(Word): assert mistake != Mistake.SKIPPED self._mistake = mistake + def _set_anchor(self, anchor): + self._anchor = anchor + self._source = anchor + class SourceWord(Word): def __init__(self, word, onset, offset): @@ -197,6 +205,9 @@ class SourceWord(Word): assert mistake != Mistake.RANDOM self._mistake = mistake + def _set_anchor(self, anchor): + self._anchor = anchor + class Sentence(list): def __init__(self, words): -- GitLab