diff --git a/umbra/dutch_mmetaphone.py b/umbra/dutch_mmetaphone.py index 6a3d1cbadbebb5d8afb577dd7d1c4cbb600e170b..caa5dc53e3cb0efb03a3305f02881481853460c0 100644 --- a/umbra/dutch_mmetaphone.py +++ b/umbra/dutch_mmetaphone.py @@ -213,8 +213,6 @@ class DutchPhonetics: source = source[:amb_pos] + "+" + source[amb_pos + 1:] amb_pos = source.find(amb) - print(source, representations) - return source, representations @staticmethod diff --git a/umbra/mistake_finder.py b/umbra/mistake_finder.py index f90ce08e48ec227ad435b9e73edf919fee759ee1..90ac984de2a947b19cd92e25189221cb30062f74 100644 --- a/umbra/mistake_finder.py +++ b/umbra/mistake_finder.py @@ -5,7 +5,7 @@ from dutch_mmetaphone import DutchPhonetics class MistakeFinder: """Finds the mistakes in already aligned lists of Words""" - def __init__(self, seman_checker): + def __init__(self, seman_checker, form_checker): """ Constructor @@ -13,6 +13,7 @@ class MistakeFinder: seman_checker: Instance of the SemanticChecker class """ self._seman_checker = seman_checker + self._form_checker = form_checker self._source = None self._shadow = None @@ -38,7 +39,9 @@ class MistakeFinder: # PRINT FOR TEST OF SO FAR IMPLEMENTED MISTAKE CHECKS: for word in self._shadow: - if not word.correct and word.mistake != Mistake.RANDOM: + if word.mistake == Mistake.REPETITION or word.mistake\ + == Mistake.SEMANTIC or word.mistake == Mistake.FORM or\ + word.mistake == Mistake.PHONETIC: print(str(word) + " " + str(word.mistake)) def _determine_mistake(self, index, word): @@ -50,12 +53,13 @@ class MistakeFinder: word: Instance of the Words class """ - if not self._check_repetition(word, index): # If not a repetition, - # if not self._check_form_mistake(word): # form, - if not self._check_semantic_mistake(word, index): # semantic, - if not self._check_phonetic_mistake(word, index): # phon; + # Check for all the types of mistakes, and if it is none of them ... + if not self._check_repetition(word, index): + if not self._check_form_mistake(word, index): + if not self._check_semantic_mistake(word, index): + if not self._check_phonetic_mistake(word, index): # ... then shadow word is random: - word.mistake = Mistake.RANDOM + word.mistake = Mistake.RANDOM def _check_phonetic_mistake(self, shd_word, index): """Checks if shd_word is a phonetic mistake or not. If yes, it flags @@ -79,10 +83,12 @@ class MistakeFinder: not phonetic_mistake and src_index < ( len(self._source) - 1): # Crude fix, but removes error src_word = self._source[src_index] - if DutchPhonetics.compare(src_word.word, shd_word.word): + phonetic_mistake = self.phonetically_related(src_word, shd_word) + if phonetic_mistake: shd_word.mistake = Mistake.PHONETIC - src_word.mistake = Mistake.PHONETIC - phonetic_mistake = True + shd_word.source = src_word + if not src_word.shadowed and src_word.mistake is None: + src_word.mistake = Mistake.PHONETIC src_index += 1 return phonetic_mistake @@ -105,16 +111,38 @@ class MistakeFinder: src_index = self._source.index( self._shadow[shd_anchor_index].anchor) while self._source[src_index].get_difference(shd_word) > 0 and\ - not semantic_mistake and src_index < ( - len(self._source)-1): # Crude fix, but removes error + not semantic_mistake and src_index < (len(self._source)-1): src_word = self._source[src_index] - if self.semantically_related(src_word, shd_word): + semantic_mistake = self.semantically_related(src_word, shd_word) + if semantic_mistake: shd_word.mistake = Mistake.SEMANTIC - src_word.mistake = Mistake.SEMANTIC - semantic_mistake = True + shd_word.source = src_word + if not src_word.shadowed and src_word.mistake is None: + src_word.mistake = Mistake.SEMANTIC src_index += 1 return semantic_mistake + def _check_form_mistake(self, shd_word, index): + if shd_word.word == "te": + print("te") + form_mistake = False + last_shd_index = self._shadow.find_last_matched_shadow(index) + if last_shd_index < 0: + src_index = 0 + else: + src_index = self._source.index(self._shadow[last_shd_index].source) + while self._source[src_index].get_difference(shd_word) > 0 and\ + not form_mistake and src_index < len(self._source) - 1: + src_word = self._source[src_index] + form_mistake = self.form_related(src_word, shd_word) + if form_mistake: + shd_word.mistake = Mistake.FORM + shd_word.source = src_word + if not src_word.shadowed and src_word.mistake is None: + src_word.mistake = Mistake.FORM + src_index += 1 + return form_mistake + def _check_repetition(self, word, index): """ Check whether a word can be seen as a repetition mistake @@ -170,8 +198,7 @@ class MistakeFinder: return chain def semantically_related(self, src_word, shd_word): - """Checks if src_word and shd_word are semantically related. If yes, it - flags both as semantic mistakes. + """Checks if src_word and shd_word are semantically related. Args: src_word: SourceWord instance @@ -183,10 +210,40 @@ class MistakeFinder: shd_string = shd_word.word src_string = src_word.word related = self._seman_checker.semantically_related(src_string, - shd_string) - if related: - shd_word.mistake = Mistake.SEMANTIC - src_word.mistake = Mistake.SEMANTIC + shd_string)\ + and shd_string != src_string + return related + + def phonetically_related(self, src_word, shd_word): + """Checks if src_word and shd_word are phonetically related. + + Args: + src_word: SourceWord instance + shd_word: ShadowWord instance + Returns: + related: True if src_word and shd_word are phonetically related, + False if not. + """ + shd_string = shd_word.word + src_string = src_word.word + related = DutchPhonetics.compare(src_string, shd_string)\ + and shd_string != src_string + return related + + def form_related(self, src_word, shd_word): + """Checks if src_word and shd_word are related in form. + + Args: + src_word: SourceWord instance + shd_word: ShadowWord instance + Returns: + related: True if src_word and shd_word are related in form, + False if not. + """ + shd_string = shd_word.word + src_string = src_word.word + related = self._form_checker.form_related(src_string, shd_string) and\ + shd_string != src_string return related def print_for_nw(self, source, shadow): diff --git a/umbra/statistics.py b/umbra/statistics.py index df0f04bfb20af725e2b8fd4d48f71c7ebfe7b43d..ef135a88089c90a9a9760404ae00ca7bd0e6c92e 100644 --- a/umbra/statistics.py +++ b/umbra/statistics.py @@ -20,11 +20,12 @@ class Statistics: # currently the case, so the two operate seperately in this class. # _parser and _seman_checker should be moved to MistakeFinder when # possible - self.path = ut.get_path("OpenDutchWordnet/resources/odwn/odwn_orbn_gwg-LMF_1.3.xml.gz") - + self.path = ut.get_path( + "OpenDutchWordnet/resources/odwn/odwn_orbn_gwg-LMF_1.3.xml.gz") self._seman_checker = SemanticChecker() self._form_checker = FormChecker() - self._mistake_finder = MistakeFinder(self._seman_checker) + self._mistake_finder = MistakeFinder(self._seman_checker, + self._form_checker) self._mistake_counter = MistakeCounter() @property diff --git a/umbra/words.py b/umbra/words.py index 9db5f7cd0118d79447ac7fd37c3494575068ddef..f2dc49ee80c514b3a3bbb0dec94a4d04a9b934f0 100644 --- a/umbra/words.py +++ b/umbra/words.py @@ -231,6 +231,25 @@ class Sentence(list): return index index -= 1 return -1 + + def find_last_matched_shadow(self, index): + """Find the index of the last shadowed word. + + Args: + index: the index before which should be sought for a shadow word + that is matched with a source word. + + Returns: + last_index: the index of the previous matched shadow. This is -1 + if there is no matches shadow before or on the specified index, or + if the index was out of bounds. + """ + if 0 <= index < len(self): + while index >= 0: + if self[index].has_source(): + return index + index -= 1 + return index def __str__(self): return ' '.join([word.word for word in self])