Merge branch 'SC-143/phonetics_in_NW'

5e7429af · Alfen, T. van (Tanja) · 3cff9580 · 422bd37e · 5e7429af · 5e7429af
Commit 5e7429af authored 5 years ago by Alfen, T. van (Tanja)
--- a/umbra/mistake_finder.py
+++ b/umbra/mistake_finder.py
 from mistake_enum import Mistake
+from dutch_mmetaphone import DutchPhonetics


 class MistakeFinder:
@@ -28,7 +29,7 @@ class MistakeFinder:
        # Loop over the shadow:
        for index, word in enumerate(self._shadow):
            if not word.correct and word.mistake is None:
-                self._determine_mistake(index,word)
+                self._determine_mistake(index, word)
        # Loop over the source:
        for word in self._source:
            if not word.shadowed and word.mistake is None:
@@ -37,8 +38,7 @@ class MistakeFinder:

        # PRINT FOR TEST OF SO FAR IMPLEMENTED MISTAKE CHECKS:
        for word in self._shadow:
-            if word.mistake == Mistake.REPETITION or word.mistake\
-                    == Mistake.SEMANTIC:
+            if not word.correct and word.mistake != Mistake.RANDOM:
                print(str(word) + " " + str(word.mistake))

    def _determine_mistake(self, index, word):
@@ -52,11 +52,40 @@ class MistakeFinder:
        """
        if not self._check_repetition(word, index):  # If not a repetition,
            # if not self._check_form_mistake(word):  # form,
-                if not self._check_semantic_mistake(word,index):  # semantic,
-                    # if not self._check_phonentic_mistake(word):  # phonetic;
+                if not self._check_semantic_mistake(word, index):  # semantic,
+                    if not self._check_phonetic_mistake(word, index):  # phon;
                        # ... then shadow word is random:
                            word.mistake = Mistake.RANDOM

+    def _check_phonetic_mistake(self, shd_word, index):
+        """Checks if shd_word is a phonetic mistake or not. If yes, it flags
+        shd_word and the source word it belongs to as phonetic mistakes.
+
+        Args:
+            shd_word: the Word which is checked for phonetic mistake.
+            index: index of shd_word in shadow.
+        Returns:
+            phonetic_mistake: True if shd_word is a phonetic mistake, False
+            if not.
+        """
+        phonetic_mistake = False
+        shd_anchor_index = self._shadow.find_previous_anchor(index)
+        if shd_anchor_index < 0:  # If there is no anchor found,
+            src_index = 0  # then just start at the beginning.
+        else:
+            src_index = self._source.index(
+                self._shadow[shd_anchor_index].anchor)
+        while self._source[src_index].get_difference(shd_word) > 0 and \
+                not phonetic_mistake and src_index < (
+                len(self._source) - 1):  # Crude fix, but removes error
+            src_word = self._source[src_index]
+            if DutchPhonetics.compare(src_word.word, shd_word.word):
+                shd_word.mistake = Mistake.PHONETIC
+                src_word.mistake = Mistake.PHONETIC
+                phonetic_mistake = True
+            src_index += 1
+        return phonetic_mistake
+
    def _check_semantic_mistake(self, shd_word, index):
        """Checks if shd_word is a semantic mistake or not. If yes, it flags
        shd_word and the source word it belongs to as semantic mistakes.
@@ -76,7 +105,8 @@ class MistakeFinder:
            src_index = self._source.index(
                self._shadow[shd_anchor_index].anchor)
        while self._source[src_index].get_difference(shd_word) > 0 and\
-                not semantic_mistake and src_index < (len(self._source)-1): # Crude fix, but removes error
+                not semantic_mistake and src_index < (
+                len(self._source)-1):  # Crude fix, but removes error
            src_word = self._source[src_index]
            if self.semantically_related(src_word, shd_word):
                shd_word.mistake = Mistake.SEMANTIC
@@ -100,8 +130,8 @@ class MistakeFinder:
        assert 0 <= index < len(self._shadow)
        found = self._check_pre_repetition(word, index)
        if not found:  # Check if word is the start of the next word.
-            if index < len(self._shadow)-1 and self._shadow[index+1].word.find(
-                    word.word) == 0:
+            if index < len(self._shadow)-1 and self._shadow[index+1].word.\
+                    startswith(word.word):
                word.mistake = Mistake.REPETITION
                found = True
        return found
@@ -125,9 +155,7 @@ class MistakeFinder:
        while not found_before and not stop and i > 0:
            i -= 1
            stop = self._shadow[i].is_anchor()
-            found_at = self._shadow[i].word.find(word.word)
-            if found_at >= 0 and found_at == len(self._shadow[i].word)-\
-                    len(word.word):
+            if self._shadow[i].word.endswith(word.word):
                found_before = True

        chain = found_before
@@ -171,19 +199,5 @@ class MistakeFinder:
        self._source = source
        self._shadow = shadow
        for word in self._shadow:
-            if word.mistake == Mistake.REPETITION or word.mistake\
-                    == Mistake.SEMANTIC:
+            if not word.correct and word.mistake != Mistake.RANDOM:
                print(str(word) + " " + str(word.mistake))
-
-    def _check_phonetic_mistake(self, shd_word):
-        """
-        Check whether a word can be seen as a phonetic mistake
-
-        Args:
-            shd_word: An instance of the Words class
-
-        Returns:
-            A boolean value indicating whether the word can be seen as a
-            phonetic mistake.
-        """
-        pass
--- a/umbra/needleman_wunsch.py
+++ b/umbra/needleman_wunsch.py
@@ -3,6 +3,7 @@ from alignment_strategy import AlignmentStrategy
 from words import Sentence
 import numpy as np
 from mistake_enum import Mistake
+from dutch_mmetaphone import DutchPhonetics


 class NeedlemanWunsch(AlignmentStrategy):
@@ -15,6 +16,7 @@ class NeedlemanWunsch(AlignmentStrategy):
        self._mismatch = -2
        self._gap_sc = -1
        self._seman_match = 2
+        self._phon_match = 2
        self._repetition = 0
        self._form_match = 2
        self._pointers = ['diag', 'up', 'left']
@@ -36,6 +38,8 @@ class NeedlemanWunsch(AlignmentStrategy):
             gap_sc: the score that is allocated for a gap
             seman_match: the score that is allocated when two words align
             by virtue of semantic equivalence
+             phon_match: the score that is allocated when two words sound the
+             same.
             repetition: the score that is allocated when a shadow word is
             a stuttering
        """
@@ -47,6 +51,8 @@ class NeedlemanWunsch(AlignmentStrategy):
            self._gap_sc = gap_sc
        if seman_match:
            self._seman_match = seman_match
+        if phon_match:
+            self._phon_match = phon_match
        if repetition:
            self._repetition = repetition
        if form_match:
@@ -96,15 +102,20 @@ class NeedlemanWunsch(AlignmentStrategy):
        n = len(self._source)
        m = len(self._shadow)
        for i in range(1, m+1):
+            shadow_word = self._shadow[i - 1]
            for j in range(1, n+1):
-                if self._source[j - 1] == self._shadow[i - 1]:
+                source_word = self._source[j - 1]
+                if source_word == shadow_word:
                    value = self._match
                elif self._form_checker.form_related(self._source[j-1].word,
                                                     self._shadow[i-1].word):
                    value = self._form_match
                elif self._seman_checker.semantically_related(
-                        self._source[j-1].word, self._shadow[i-1].word):
+                        source_word.word, shadow_word.word):
                    value = self._seman_match
+                elif DutchPhonetics.compare(source_word.word,
+                                            shadow_word.word):
+                    value = self._phon_match
                else:
                    value = self._mismatch
                match_value = self._matrix[i-1, j-1]['value'] + value
@@ -154,6 +165,8 @@ class NeedlemanWunsch(AlignmentStrategy):
                alignment_shadow.append(self._shadow[i - 1])
                if self._check_repetition(i - 1):
                    self._shadow[i - 1].mistake = Mistake.REPETITION
+                else:
+                    self._shadow[i - 1].mistake = Mistake.RANDOM
                i -= 1

        alignment_source.reverse()
@@ -194,4 +207,10 @@ class NeedlemanWunsch(AlignmentStrategy):
            source.shadowed = True
            source.mistake = Mistake.FORM
            shadow.mistake = Mistake.FORM
+        elif DutchPhonetics.compare(source.word, shadow.word):
+            source.shadowed = True
+            source.mistake = Mistake.PHONETIC
+            shadow.mistake = Mistake.PHONETIC
+        else:
+            shadow.mistake = Mistake.RANDOM
        return source, shadow
--- a/umbra/words.py
+++ b/umbra/words.py
@@ -16,6 +16,7 @@ class Word:
        return 1

    def __eq__(self, word):
+        # TODO: move self._onset < word.onset out of __eq__
        return self._word == word.word and self._onset < word.onset

    @property