Cleaned up code. Added exceptions. Added functionality to compare a source...

Cleaned up code. Added exceptions. Added functionality to compare a source with the shadowed words; a few changes still need to be discussed before it is fully integrated.

Cleaned up code. Added exceptions. Added functionality to compare a source...
Cleaned up code. Added exceptions. Added functionality to compare a source with the shadowed words; a few changes still need to be discussed before it is fully integrated.
cd6f0f00 · Nijsen, T · f44b67f9 · cd6f0f00
Commit cd6f0f00 authored 5 years ago by Nijsen, T
--- a/umbra/dutch_mmetaphone.py
+++ b/umbra/dutch_mmetaphone.py
-# This is a new version of the previous algorithm
+"""This is a new version of the previous algorithm
-# It is not a 'multiple' metaphone algorithm for
+   It is not a 'multiple' metaphone algorithm for
-# the Dutch language
+   the Dutch language
-# It now works with rule set for structures at
+   It now works with rule set for structures at
-# certain levels.
+   certain levels.
-# The phonetic representation is an adapted form
+   The phonetic representation is an adapted form
-# of SAMPA.
+  of SAMPA. """
 # Replacements: A : a>, S : s>, N : n>, h\ : h_, v\ : v_
+# '+' : indicates removal of recognized pattern
 import copy
 class DutchPhonetics:
    @staticmethod
-    def compare(word1, word2):
+    def compare_with_shadow(S_original, S_shadow):
+        """Compare the phonetic representations of the original sentence and
+        the associated shadowed words.
+        Args: S_original is a list of SourceWord objects in which _source
+        is the associated shadowed word. (It is unclear whether this is how
+        the attribute is meant to be used, but this can be changed later.)
+        Assumption: the source attribute is always a Word object.
+        Furthermore, Words need a fon_correct attribute.
+        """
+        # Check 1:1 word matches
+        for original in S_original:
+            original.fon_correct = DutchPhonetics.compare_words(original.word, original.source.word)
+        # Check 1:2 word matches
+        for original in S_original:
+            if not original.fon_correct:
+                shadow_onset = original.source.onset
+                sh_idx = -1
+                for idx, shadow in S_shadow:
+                    onset = shadow.onset
+                    if shadow_onset == onset and idx+1 < len(S_shadow):
+                        sh_idx = idx + 1
+                shadow_combined = original.source.word + str(S_shadow[sh_idx])
+                original.fon_correct = DutchPhonetics.compare_words(original.word, shadow_combined)
+    @staticmethod
+    def compare_words(word1, word2):
+        """Compare the phonetic representations of 2 source strings and see
+        if any representation of the first matches one of the second."""
        word1_representations, word2_representations = DutchPhonetics.mmetaphone(word1), \
-                                                      DutchPhonetics.mmetaphone(word2)
+                                                       DutchPhonetics.mmetaphone(word2)
        return any(w1_rep in word2_representations for w1_rep in word1_representations)
    @staticmethod
    def mmetaphone(source):
+        """Convert a source string to its possible phonetic representations."""
        source = source.upper()
        working_source = copy.deepcopy(source)  # Create a copy that we can manipulate
        representations = []
@@ -31,7 +63,9 @@ class DutchPhonetics:
    @staticmethod
    def derden(source, representations):
-        uitzonderingen = {'AIL': 'a>i', 'TSJ': 'ts>'}
+        """Recognise 3-letter patterns in the source and convert them to their
+        phonetic representation. """
+        uitzonderingen = {'AIL': 'a>i', 'TSJ': 'ts>', 'ZIJ': 'zy>', 'JIJ': 'jy>', 'WIJ': 'wy>'}
        uz_len = 3
        new_rep = copy.deepcopy(source)  # New representation without ambiguities yet
@@ -39,11 +73,9 @@ class DutchPhonetics:
            uz_pos = source.find(uz)
            while uz_pos != -1:
                new_rep = new_rep[:uz_pos] + uitzonderingen[uz] + new_rep[uz_pos+uz_len:]
-                source = source[:uz_pos] + "+" + source[uz_pos+uz_len:]  # '+' : indicates removal of recognized piece
+                source = source[:uz_pos] + "+" + source[uz_pos+uz_len:]
                uz_pos = source.find(uz)
-        #print(source, new_rep)
        if new_rep not in representations:
            representations.append(new_rep)
@@ -51,60 +83,61 @@ class DutchPhonetics:
    @staticmethod
    def dubbelen(source, representations):
+        """Recognise 2-letter patterns in the source and convert them to their
+        phonetic representation. """
        dubbel_tweeklank = {'AA': 'a:', 'EE': 'e:', 'IE': 'i', 'OO': 'o:', 'OE': 'u'}
        d_len = 2
        for dtk in dubbel_tweeklank:
            dtk_pos = source.find(dtk)
+            add_len = 0
+            addition = ''
            while dtk_pos != -1:
+                klank_fon = dubbel_tweeklank[dtk]
-                if dtk in {'AA', 'OO', 'OE'} and dtk_pos+d_len < len(source) and source[dtk_pos+d_len] == 'I':
+                if dtk in {'AA', 'OO', 'OE'} and dtk_pos+d_len < len(source) \
-                    source = source[:dtk_pos] + "+" + source[dtk_pos + d_len + 1:]
+                        and source[dtk_pos+d_len] == 'I':
-                    klank_fon = dubbel_tweeklank[dtk]
+                    addition = 'i'
-                    DutchPhonetics.update_representations(representations, dtk, d_len, klank_fon, 'i', 1)
+                    add_len = 1
-                if dtk in {'EE', 'IE'} and dtk_pos+d_len+1 < len(source) and source[dtk_pos+d_len:dtk_pos+d_len+2] == 'UW':
+                if dtk in {'EE', 'IE'} and dtk_pos+d_len+1 < len(source) \
-                    source = source[:dtk_pos] + "+" + source[dtk_pos + d_len + 2:]
+                        and source[dtk_pos+d_len:dtk_pos+d_len+2] == 'UW':
-                    klank_fon = dubbel_tweeklank[dtk]
+                    addition = 'u'
-                    DutchPhonetics.update_representations(representations, dtk, d_len, klank_fon, 'u', 2)
+                    add_len = 2
-                else:
-                    source = source[:dtk_pos] + "+" + source[dtk_pos + d_len:]
-                    klank_fon = dubbel_tweeklank[dtk]
-                    DutchPhonetics.update_representations(representations, dtk, d_len, klank_fon)
+                DutchPhonetics.update_representations(representations, dtk, d_len, klank_fon, addition, add_len)
+                source = source[:dtk_pos] + "+" + source[dtk_pos + d_len + add_len:]
                dtk_pos = source.find(dtk)
        dubbel = {'UU': 'y', 'EU': '2:', 'OE': 'u', 'UE': 'u:', 'IJ': 'e>i', 'EI': 'e>i',
-              'UI': '9y', 'OU': 'v>u', 'AU': 'v>u', 'SJ': 's>', 'NG': 'n>', 'DT': 't',
+                  'UI': '9y', 'OU': 'v>u', 'AU': 'v>u', 'SJ': 's>', 'NG': 'n>', 'DT': 't',
-              'TH': 't'}
+                  'TH': 't', 'ZE': 'zy>', 'JE': 'jy>', 'WE': 'wy>'}
        for d in dubbel:
            d_pos = source.find(d)
            while d_pos != -1:
+                DutchPhonetics.update_representations(representations, d, d_len, dubbel[d])
                source = source[:d_pos] + "+" + source[d_pos + d_len:]
-                klank_fon = dubbel[d]
-                DutchPhonetics.update_representations(representations, d, d_len, klank_fon)
                d_pos = source.find(d)
-        # e.g. CH kan 3 soorten klanken worden, afhankelijk van of het stemloos, stemhebbend of een
+        # e.g. CH can have 3 types of sound, depending on whether it is voiced, unvoiced or a word
-        # leen woord als 'chef' is.
+        # borrowed from another language, e.g. chef
        ambigu = {'CH': ('x', 'g>', 's>')}
        for amb in ambigu:
            amb_pos = source.find(amb)
            while amb_pos != -1:
+                representations = DutchPhonetics.update_rep_ambiguous(representations, amb, d_len, ambigu[amb])
                source = source[:amb_pos] + "+" + source[amb_pos + d_len:]
-                amb_klanken = ambigu[amb]
-                representations = DutchPhonetics.update_rep_ambiguous(representations, amb, d_len, amb_klanken)
                amb_pos = source.find(amb)
-        #print(source, representations)
        return source, representations
    @staticmethod
    def enkelen(source, representations):
+        """Recognise single-letter patterns in the source and convert them to their
+        phonetic representation. """
        stemloos = {'P': 'p', 'T': 't', 'K': 'k', 'F': 'f', 'S': 's'}
        stemhebbend = {'B': 'b', 'D': 'd', 'V': 'v', 'Z': 'z', 'G': 'g>',
                       'H': 'h_', 'L': 'l', 'R': 'r', 'M': 'm', 'N': 'n',
@@ -123,7 +156,7 @@ class DutchPhonetics:
                    DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
                else:
                    klank_fon = klinkers[letter]
-                    # Klinker zonder dubbele medeklinker wordt langer geluid
+                    # A vowel (klinker) without a double consonant (medeklinker) becomes a longer sound
                    if l_idx + 2 < len(source):
                        if (source[l_idx+1] in stemloos or source[l_idx+1] in stemhebbend) \
                                and not (source[l_idx+2] in stemloos or source[l_idx+2] in stemhebbend):
@@ -152,14 +185,14 @@ class DutchPhonetics:
            if letter in stemhebbend:
                source = source[:l_idx] + "+" + source[l_idx + 1:]
-                if l_idx == len(source)-1 and letter in verstemlozing:  # eindklankverscherping
+                if l_idx == len(source)-1 and letter in verstemlozing:  # final devoicing
                    klank_fon = verstemlozing[letter]
                    DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
                else:
                    klank_fon = stemhebbend[letter]
                    DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
-        # e.g. journaal & jas
+        # e.g. journaal & jas or xanten & xenofobie
        ambigu = {'J': ('z>', 'j', 'dz>'), 'X': ('ks', 'z')}
        for amb in ambigu:
            amb_pos = source.find(amb)
@@ -169,11 +202,11 @@ class DutchPhonetics:
                representations = DutchPhonetics.update_rep_ambiguous(representations, amb, 1, amb_klanken)
                amb_pos = source.find(amb)
-        # letters met diakritische tekens. Tekens in NL: {accent aigu : é, accent grave: è, trema: ë,
+        """ Letters with diacritical marks. Marks used in Dutch: {accent aigu: é, accent grave: è, trema: ë,
-        # umlaut: ü, accent circonflexe: ê, cedille: ç}
+            umlaut: ü, accent circonflexe: ê, cedille: ç}
-        # Not all possibilities are in here because not everything can be written in Python here
+         Not all possibilities are in here because not everything can be written in Python. 
-        # So these have been excluded
+         So these have been excluded.
-        # Cedille has only 2 letters associated and they are used very infrequently so its not included
+         Cedille has only 2 letters associated and they are used very infrequently so it is not included """
        aigu = {'Á': 'a:', 'É': 'e:', 'Í': 'i', 'Ó': 'o:', 'Ú': 'y', 'Ý': 'e>i'}
        grave_circonflexe = {'À': 'a>', 'È': 'e>', 'Ì': 'i>', 'Ò': 'o>', 'Ù': 'y>',
                             'Â': 'a>', 'Ê': 'e>', 'Î': 'i>', 'Ô': 'o>', 'Û': 'y>'}
@@ -201,9 +234,8 @@ class DutchPhonetics:
        for amb in trema_umlaut:
            amb_pos = source.find(amb)
            while amb_pos != -1:
+                representations = DutchPhonetics.update_rep_ambiguous(representations, amb, 1, ambigu[amb])
                source = source[:amb_pos] + "+" + source[amb_pos + 1:]
-                amb_klanken = ambigu[amb]
-                representations = DutchPhonetics.update_rep_ambiguous(representations, amb, 1, amb_klanken)
                amb_pos = source.find(amb)
        print(source, representations)