From f44b67f9f820ebc1f58598daacf13bdf68e4dab4 Mon Sep 17 00:00:00 2001
From: "Nijsen, T" <s1006955@ru.nl>
Date: Tue, 3 Dec 2019 13:23:07 +0100
Subject: [PATCH] Tested the algorithm on the test data and tried implementing
 another rule. Lastly inspected the results.

---
 phonetics test/dutch_mmetaphone.py |   6 +-
 umbra/dutch_mmetaphone.py          | 252 +++++++++++++++++++++++++++++
 umbra/statistics.py                |   8 +-
 umbra/words.py                     |   2 +-
 4 files changed, 263 insertions(+), 5 deletions(-)
 create mode 100644 umbra/dutch_mmetaphone.py

diff --git a/phonetics test/dutch_mmetaphone.py b/phonetics test/dutch_mmetaphone.py
index f474f951..de4b49a4 100644
--- a/phonetics test/dutch_mmetaphone.py	
+++ b/phonetics test/dutch_mmetaphone.py	
@@ -118,7 +118,6 @@ class DutchPhonetics:
         # Deal only with klinkers
         for l_idx, letter in enumerate(source):
             if letter in klinkers:
-                source = source[:l_idx] + "+" + source[l_idx+1:]
                 if letter == 'E' and l_idx == len(source)-1:  # 'E' at the end of a word becomes an 'uh' sound
                     klank_fon = 'y>'
                     DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
@@ -127,9 +126,10 @@ class DutchPhonetics:
                     # Klinker zonder dubbele medeklinker wordt langer geluid
                     if l_idx + 2 < len(source):
                         if (source[l_idx+1] in stemloos or source[l_idx+1] in stemhebbend) \
-                                and not(source[l_idx+2] in stemloos or source[l_idx+2] in stemhebbend):
+                                and not (source[l_idx+2] in stemloos or source[l_idx+2] in stemhebbend):
                             klank_fon = klank_fon[0] + ':'
                     DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
+                source = source[:l_idx] + "+" + source[l_idx + 1:]
 
         # Deal only with medeklinkers
         for l_idx, letter in enumerate(source):
@@ -235,6 +235,7 @@ DutchPhonetics.mmetaphone("Detail")
 DutchPhonetics.mmetaphone("Haai")
 DutchPhonetics.mmetaphone("Leeuw")
 DutchPhonetics.mmetaphone("Lach")
+DutchPhonetics.mmetaphone("Lag")
 DutchPhonetics.mmetaphone('Jazz')  # leenwoord, representatie kan niet 100% worden opgevangen tenzij als special case
 DutchPhonetics.mmetaphone("Handvat")  # 'dv' does not become a 'tf' sound yet
 DutchPhonetics.mmetaphone("Hand")
@@ -246,3 +247,4 @@ DutchPhonetics.mmetaphone("Pet")
 DutchPhonetics.mmetaphone("Petten")
 DutchPhonetics.mmetaphone("Peter")
 DutchPhonetics.mmetaphone("FeeÃ«n")
+DutchPhonetics.mmetaphone("Schakelen")
diff --git a/umbra/dutch_mmetaphone.py b/umbra/dutch_mmetaphone.py
new file mode 100644
index 00000000..f75df5a0
--- /dev/null
+++ b/umbra/dutch_mmetaphone.py
@@ -0,0 +1,252 @@
+# This is a new version of the previous algorithm
+# It is not a 'multiple' metaphone algorithm for
+# the Dutch language
+# It now works with rule sets for structures at
+# certain levels.
+# The phonetic representation is an adapted form
+# of SAMPA.
+# Replacements: A : a>, S : s>, N : n>, h\ : h_, v\ : v_
+import copy
+
+
+class DutchPhonetics:
+    """Rule-based phonetic encoder for Dutch words.
+
+    Every rule set receives and returns two values: ``source``, the
+    upper-cased word in which each recognised piece is replaced by one '+',
+    and ``representations``, the list of partly encoded spellings in which
+    each recognised piece is replaced by its (lower-case) sound.
+    """
+
+    @staticmethod
+    def compare(word1, word2):
+        """Return True if the two words share at least one representation."""
+        word1_representations = DutchPhonetics.mmetaphone(word1)
+        word2_representations = DutchPhonetics.mmetaphone(word2)
+        return any(w1_rep in word2_representations for w1_rep in word1_representations)
+
+    @staticmethod
+    def mmetaphone(source):
+        """Return the list of phonetic representations of the word ``source``.
+
+        The rule sets run from the longest patterns to the shortest, so a
+        multi-letter sound is consumed before its individual letters are.
+        """
+        working_source = source.upper()
+        representations = []
+
+        structure_rule_sets = (DutchPhonetics.derden, DutchPhonetics.dubbelen, DutchPhonetics.enkelen)
+        for rule_set in structure_rule_sets:
+            working_source, representations = rule_set(working_source, representations)
+
+        return representations
+
+    @staticmethod
+    def derden(source, representations):
+        """Apply the three-letter exceptions; return ``(source, representations)``.
+
+        The encoded word is appended to ``representations`` unless it is
+        already in there; this seeds the list for the later rule sets.
+        """
+        uitzonderingen = {'AIL': 'a>i', 'TSJ': 'ts>'}
+        new_rep = source  # New representation without ambiguities yet
+
+        # Replace per string instead of reusing an index found in `source` on
+        # `new_rep`: '+' is shorter than the written sound, so from the second
+        # match on the two strings no longer line up.
+        for uz, klank_fon in uitzonderingen.items():
+            new_rep = new_rep.replace(uz, klank_fon)
+            source = source.replace(uz, "+")  # '+' : indicates removal of recognized piece
+
+        if new_rep not in representations:
+            representations.append(new_rep)
+
+        return source, representations
+
+    @staticmethod
+    def dubbelen(source, representations):
+        """Apply the two-letter rules; return ``(source, representations)``.
+
+        Handles, in this order: the long vowels (with an optional 'I' or 'UW'
+        glide), the other two-letter sounds and the ambiguous 'CH', which
+        multiplies the number of representations.
+        """
+        dubbel_tweeklank = {'AA': 'a:', 'EE': 'e:', 'IE': 'i', 'OO': 'o:', 'OE': 'u'}
+        d_len = 2
+
+        for dtk, klank_fon in dubbel_tweeklank.items():
+            dtk_pos = source.find(dtk)
+            while dtk_pos != -1:
+                rest = source[dtk_pos + d_len:]
+                # Exactly one branch runs per match. Each branch consumes the
+                # match from `source`, so running a second one would swallow
+                # letters that were never recognised.
+                if dtk in {'AA', 'OO', 'OE'} and rest.startswith('I'):  # e.g. haai
+                    additie, additie_len = 'i', 1
+                elif dtk in {'EE', 'IE'} and rest.startswith('UW'):  # e.g. leeuw
+                    additie, additie_len = 'u', 2
+                else:
+                    additie, additie_len = '', 0
+                source = source[:dtk_pos] + "+" + rest[additie_len:]
+                DutchPhonetics.update_representations(representations, dtk, d_len, klank_fon,
+                                                      additie, additie_len)
+                dtk_pos = source.find(dtk)
+
+        dubbel = {'UU': 'y', 'EU': '2:', 'OE': 'u', 'UE': 'u:', 'IJ': 'e>i', 'EI': 'e>i',
+                  'UI': '9y', 'OU': 'v>u', 'AU': 'v>u', 'SJ': 's>', 'NG': 'n>', 'DT': 't',
+                  'TH': 't'}
+
+        for d, klank_fon in dubbel.items():
+            d_pos = source.find(d)
+            while d_pos != -1:
+                source = source[:d_pos] + "+" + source[d_pos + d_len:]
+                DutchPhonetics.update_representations(representations, d, d_len, klank_fon)
+                d_pos = source.find(d)
+
+        # e.g. CH can become 3 kinds of sounds, depending on whether it is voiceless, voiced or
+        # part of a loan word such as 'chef'.
+        ambigu = {'CH': ('x', 'g>', 's>')}
+
+        for amb, amb_klanken in ambigu.items():
+            amb_pos = source.find(amb)
+            while amb_pos != -1:
+                source = source[:amb_pos] + "+" + source[amb_pos + d_len:]
+                representations = DutchPhonetics.update_rep_ambiguous(representations, amb, d_len, amb_klanken)
+                amb_pos = source.find(amb)
+
+        return source, representations
+
+    @staticmethod
+    def enkelen(source, representations):
+        """Apply the single-letter rules; return ``(source, representations)``.
+
+        Handles, in this order: vowels, consonants (with voicing changes next
+        to other consonants and at the end of the word), the ambiguous 'J' and
+        'X', and letters with diacritics.
+        """
+        stemloos = {'P': 'p', 'T': 't', 'K': 'k', 'F': 'f', 'S': 's'}
+        stemhebbend = {'B': 'b', 'D': 'd', 'V': 'v', 'Z': 'z', 'G': 'g>',
+                       'H': 'h_', 'L': 'l', 'R': 'r', 'M': 'm', 'N': 'n',
+                       'W': 'v_'}
+
+        verstemlozing = {'V': 'f', 'Z': 's', 'G': 'x', 'D': 't'}
+        verstemhebbing = {'F': 'v', 'S': 'z', 'P': 'b', 'T': 'd', 'K': 'g'}
+
+        klinkers = {'A': 'a>', 'E': 'e>', 'I': 'i>', 'O': 'o>', 'U': 'y>'}
+        update = DutchPhonetics.update_representations
+        laatste = len(source) - 1  # Constant: every step below swaps 1 letter for 1 '+'
+
+        # Deal only with klinkers (vowels)
+        for l_idx, letter in enumerate(source):
+            if letter not in klinkers:
+                continue
+            klank_fon = klinkers[letter]
+            if letter == 'E' and l_idx == laatste:  # 'E' at the end of a word becomes an 'uh' sound
+                klank_fon = 'y>'
+            elif l_idx + 2 <= laatste:
+                # A vowel followed by exactly one consonant becomes a longer sound
+                volgende, daarna = source[l_idx + 1], source[l_idx + 2]
+                if (volgende in stemloos or volgende in stemhebbend) \
+                        and not (daarna in stemloos or daarna in stemhebbend):
+                    klank_fon = klank_fon[0] + ':'
+            update(representations, letter, 1, klank_fon)
+            source = source[:l_idx] + "+" + source[l_idx + 1:]
+
+        # Deal only with medeklinkers (consonants). The branches exclude each
+        # other: a letter must be written once, because a second update would
+        # overwrite the next occurrence of that letter in the representations.
+        for l_idx, letter in enumerate(source):
+            volgende = source[l_idx + 1] if l_idx < laatste else ''
+            if letter in verstemhebbing and volgende in stemhebbend:
+                klank_fon = verstemhebbing[letter]
+            elif letter in verstemlozing and (volgende in stemloos or l_idx == laatste):
+                klank_fon = verstemlozing[letter]  # Second case: eindklankverscherping
+            elif letter in stemloos:
+                klank_fon = stemloos[letter]
+            elif letter in stemhebbend:
+                klank_fon = stemhebbend[letter]
+            else:
+                continue
+            source = source[:l_idx] + "+" + source[l_idx + 1:]
+            update(representations, letter, 1, klank_fon)
+
+        # e.g. journaal & jas
+        ambigu = {'J': ('z>', 'j', 'dz>'), 'X': ('ks', 'z')}
+        for amb, amb_klanken in ambigu.items():
+            amb_pos = source.find(amb)
+            while amb_pos != -1:
+                source = source[:amb_pos] + "+" + source[amb_pos + 1:]
+                representations = DutchPhonetics.update_rep_ambiguous(representations, amb, 1, amb_klanken)
+                amb_pos = source.find(amb)
+
+        # Letters with diacritics, written as escapes so that the file encoding cannot
+        # corrupt them. Not every possibility is in here; the cedille has only 2 letters
+        # associated and they are used very infrequently, so it is not included.
+        # Accent aigu on A, E, I, O, U, Y:
+        aigu = {'\u00c1': 'a:', '\u00c9': 'e:', '\u00cd': 'i', '\u00d3': 'o:', '\u00da': 'y',
+                '\u00dd': 'e>i'}
+        # Accent grave on A, E, I, O, U, then accent circonflexe on A, E, I, O, U:
+        grave_circonflexe = {'\u00c0': 'a>', '\u00c8': 'e>', '\u00cc': 'i>', '\u00d2': 'o>', '\u00d9': 'y>',
+                             '\u00c2': 'a>', '\u00ca': 'e>', '\u00ce': 'i>', '\u00d4': 'o>', '\u00db': 'y>'}
+        trema = {'\u00cb': 'e>', '\u00cf': 'i>'}  # Trema on E, I
+        overig = {'Y': 'e>i', 'Q': 'k'}
+        enkel_klanken = {**aigu, **grave_circonflexe, **trema, **overig}
+        for l_idx, letter in enumerate(source):
+            if letter in enkel_klanken:
+                source = source[:l_idx] + "+" + source[l_idx + 1:]
+                update(representations, letter, 1, enkel_klanken[letter])
+
+        # Trema / umlaut on A, O, U. Heavily irregular, hence two candidate sounds each.
+        trema_umlaut = {'\u00c4': ('a>', 'e>'), '\u00d6': ('o>', '2:'), '\u00dc': ('y>', 'y')}
+        for amb, amb_klanken in trema_umlaut.items():  # Sounds come from this table, not `ambigu`
+            amb_pos = source.find(amb)
+            while amb_pos != -1:
+                source = source[:amb_pos] + "+" + source[amb_pos + 1:]
+                representations = DutchPhonetics.update_rep_ambiguous(representations, amb, 1, amb_klanken)
+                amb_pos = source.find(amb)
+
+        print(source, representations)
+
+        return source, representations
+
+    @staticmethod
+    def update_rep_ambiguous(representations, amb_spelling, spel_len, amb_klanken):
+        """Return a new list holding one variant of every representation per sound.
+
+        ``representations`` itself is not modified.
+        """
+        all_new_reps = []
+        for amb_klank in amb_klanken:
+            n_representations = copy.copy(representations)
+            DutchPhonetics.update_representations(n_representations, amb_spelling, spel_len, amb_klank)
+            all_new_reps.extend(n_representations)
+
+        return all_new_reps
+
+    @staticmethod
+    def update_representations(representations, spelling, spel_len, klank, additie="", additie_len=0):
+        """Replace, in place, the first ``spelling`` in every representation.
+
+        ``spel_len + additie_len`` characters starting at ``spelling`` make way
+        for ``klank + additie``; representations without ``spelling`` are kept.
+        """
+        fon_rep = klank + additie
+        tot_len = spel_len + additie_len
+        for rep_idx, rep in enumerate(representations):
+            spelling_pos = rep.find(spelling)
+            if spelling_pos != -1:
+                representations[rep_idx] = rep[:spelling_pos] + fon_rep + rep[spelling_pos + tot_len:]
+
+
+if __name__ == "__main__":
+    # Test examples
+    # 'Jazz': loan word, cannot be represented 100% unless it becomes a special case.
+    # 'Handvat': 'dv' does not become a 'tf' sound yet.
+    # 'Weggelopen': 'gg' does not become a 'xg>' sound yet and, because a single
+    # medeklinker follows, the second 'e' becomes an 'e:' sound. It should be more like
+    # a 'y' sound, but is 'ge' frequent enough to be a special case of 2 letters?
+    for example in ("Detail", "Haai", "Leeuw", "Lach", "Lag", "Jazz", "Handvat", "Hand",
+                    "Weggelopen", "Pet", "Petten", "Peter", "Fee\u00ebn", "Schakelen", "Ik",
+                    "Radioprogramma"):
+        # The trace at the end of enkelen prints the result
+        DutchPhonetics.mmetaphone(example)
diff --git a/umbra/statistics.py b/umbra/statistics.py
index 4ae427b4..0b6a7611 100644
--- a/umbra/statistics.py
+++ b/umbra/statistics.py
@@ -1,6 +1,7 @@
 from saa_algorithm import SaaAlgorithm
 from saa_Romeo import SaaRomeo
 from anchor_algorithm import AnchorAlgorithm
+from dutch_mmetaphone import DutchPhonetics as dp
 
 
 class Statistics:
@@ -31,9 +32,12 @@ class Statistics:
         print('Romeo')
         self._strategy = SaaRomeo()
         source_align, shadow_align = self._strategy.align(source, shadow)
-        print([str(x) for x in source_align.words])
-        print([str(x) for x in shadow_align.words])
+        source_words = [str(x) for x in source_align.words]
+
+        #print([str(x) for x in source_align.words])
+        #print([str(x) for x in shadow_align.words])
         correctness = self._strategy.correctly_shadowed(source)
+        fon_reps = [dp.mmetaphone(x) for x in source_words]
 
         # Reset the is_shadowed property
         for word in source.words:
diff --git a/umbra/words.py b/umbra/words.py
index 4d206516..2936c860 100644
--- a/umbra/words.py
+++ b/umbra/words.py
@@ -6,7 +6,7 @@ class Word:
         self._anchor = None
 
     def __str__(self):
-        return "%s | %f | %f" % (self._word, self._onset, self._offset)
+        return self._word
 
     def __len__(self):
         return 1
-- 
GitLab