diff --git a/phonetics test/dutch_mmetaphone.py b/phonetics test/dutch_mmetaphone.py index f474f9519cdf78ccb4e29b5db226ae5313702ded..de4b49a42517ebd7ef5d6eff2dc9a0425f5d3f80 100644 --- a/phonetics test/dutch_mmetaphone.py +++ b/phonetics test/dutch_mmetaphone.py @@ -118,7 +118,6 @@ class DutchPhonetics: # Deal only with klinkers for l_idx, letter in enumerate(source): if letter in klinkers: - source = source[:l_idx] + "+" + source[l_idx+1:] if letter == 'E' and l_idx == len(source)-1: # 'E' at the end of a word becomes an 'uh' sound klank_fon = 'y>' DutchPhonetics.update_representations(representations, letter, 1, klank_fon) @@ -127,9 +126,10 @@ class DutchPhonetics: # Klinker zonder dubbele medeklinker wordt langer geluid if l_idx + 2 < len(source): if (source[l_idx+1] in stemloos or source[l_idx+1] in stemhebbend) \ - and not(source[l_idx+2] in stemloos or source[l_idx+2] in stemhebbend): + and not (source[l_idx+2] in stemloos or source[l_idx+2] in stemhebbend): klank_fon = klank_fon[0] + ':' DutchPhonetics.update_representations(representations, letter, 1, klank_fon) + source = source[:l_idx] + "+" + source[l_idx + 1:] # Deal only with medeklinkers for l_idx, letter in enumerate(source): @@ -235,6 +235,7 @@ DutchPhonetics.mmetaphone("Detail") DutchPhonetics.mmetaphone("Haai") DutchPhonetics.mmetaphone("Leeuw") DutchPhonetics.mmetaphone("Lach") +DutchPhonetics.mmetaphone("Lag") DutchPhonetics.mmetaphone('Jazz') # leenwoord, representatie kan niet 100% worden opgevangen tenzij als special case DutchPhonetics.mmetaphone("Handvat") # 'dv' does not become a 'tf' sound yet DutchPhonetics.mmetaphone("Hand") @@ -246,3 +247,4 @@ DutchPhonetics.mmetaphone("Pet") DutchPhonetics.mmetaphone("Petten") DutchPhonetics.mmetaphone("Peter") DutchPhonetics.mmetaphone("Feeën") +DutchPhonetics.mmetaphone("Schakelen") diff --git a/umbra/dutch_mmetaphone.py b/umbra/dutch_mmetaphone.py new file mode 100644 index 
# Rule-based phonetic transcription for Dutch words.
# Rule sets are applied at three structural levels: three-letter exceptions,
# two-letter combinations and finally single letters.
# The phonetic representation is an adapted form of SAMPA.
# Replacements: A : a>, S : s>, N : n>, h\ : h_, v\ : v_


class DutchPhonetics:
    """Produce phonetic representations of Dutch words.

    Every rule set receives two things: ``source``, the upper-cased word in
    which already recognised pieces are replaced by ``'+'``, and
    ``representations``, a list of partially transcribed strings in which
    recognised spellings are replaced by (lower-case) phonetic symbols.
    Ambiguous spellings multiply the number of representations.
    """

    @staticmethod
    def compare(word1, word2):
        """Return True when the two words share at least one representation."""
        word1_representations = DutchPhonetics.mmetaphone(word1)
        word2_representations = DutchPhonetics.mmetaphone(word2)
        return any(w1_rep in word2_representations for w1_rep in word1_representations)

    @staticmethod
    def mmetaphone(source):
        """Return the list of phonetic representations of ``source``.

        The word is upper-cased and passed through the three rule sets in
        order: ``derden``, ``dubbelen``, ``enkelen``.
        """
        working_source = source.upper()  # str is immutable, no copy needed
        representations = []

        structure_rule_sets = (DutchPhonetics.derden, DutchPhonetics.dubbelen, DutchPhonetics.enkelen)
        for rule_set in structure_rule_sets:
            working_source, representations = rule_set(working_source, representations)

        return representations

    @staticmethod
    def derden(source, representations):
        """Apply the three-letter exceptions and seed ``representations``.

        Returns the marked ``source`` and the (in-place extended) list of
        representations.
        """
        uitzonderingen = {'AIL': 'a>i', 'TSJ': 'ts>'}  # exceptions
        new_rep = source  # New representation without ambiguities yet

        for uz, klank_fon in uitzonderingen.items():
            # Replace in both strings independently: ``source`` shrinks while
            # ``new_rep`` does not, so positions cannot be shared between them.
            new_rep = new_rep.replace(uz, klank_fon)
            source = source.replace(uz, "+")  # '+' : indicates removal of recognized piece

        if new_rep not in representations:
            representations.append(new_rep)

        return source, representations

    @staticmethod
    def dubbelen(source, representations):
        """Apply the two-letter rules (long vowels, diphthongs, digraphs, 'CH')."""
        d_len = 2
        dubbel_tweeklank = {'AA': 'a:', 'EE': 'e:', 'IE': 'i', 'OO': 'o:', 'OE': 'u'}

        for dtk, klank_fon in dubbel_tweeklank.items():
            dtk_pos = source.find(dtk)
            while dtk_pos != -1:
                rest = source[dtk_pos + d_len:]
                # Exactly one of the three cases applies per occurrence.
                if dtk in {'AA', 'OO', 'OE'} and rest.startswith('I'):
                    additie, additie_len = 'i', 1
                elif dtk in {'EE', 'IE'} and rest.startswith('UW'):
                    additie, additie_len = 'u', 2
                else:
                    additie, additie_len = '', 0

                source = DutchPhonetics._mark_consumed(source, dtk_pos, d_len + additie_len)
                DutchPhonetics.update_representations(representations, dtk, d_len, klank_fon,
                                                      additie, additie_len)
                dtk_pos = source.find(dtk)

        dubbel = {'UU': 'y', 'EU': '2:', 'OE': 'u', 'UE': 'u:', 'IJ': 'e>i', 'EI': 'e>i',
                  'UI': '9y', 'OU': 'v>u', 'AU': 'v>u', 'SJ': 's>', 'NG': 'n>', 'DT': 't',
                  'TH': 't'}
        source = DutchPhonetics._apply_table(source, representations, dubbel, d_len)

        # e.g. CH can become 3 kinds of sounds, depending on whether it is voiceless, voiced or
        # part of a loanword such as 'chef'.
        ambigu = {'CH': ('x', 'g>', 's>')}
        return DutchPhonetics._apply_ambiguous_table(source, representations, ambigu, d_len)

    @staticmethod
    def enkelen(source, representations):
        """Apply the single-letter rules (vowels, consonants, diacritics)."""
        stemloos = {'P': 'p', 'T': 't', 'K': 'k', 'F': 'f', 'S': 's'}  # voiceless
        stemhebbend = {'B': 'b', 'D': 'd', 'V': 'v', 'Z': 'z', 'G': 'g>',
                       'H': 'h_', 'L': 'l', 'R': 'r', 'M': 'm', 'N': 'n',
                       'W': 'v_'}  # voiced
        medeklinkers = stemloos.keys() | stemhebbend.keys()  # consonants

        verstemlozing = {'V': 'f', 'Z': 's', 'G': 'x', 'D': 't'}  # devoicing
        verstemhebbing = {'F': 'v', 'S': 'z', 'P': 'b', 'T': 'd', 'K': 'g'}  # voicing

        klinkers = {'A': 'a>', 'E': 'e>', 'I': 'i>', 'O': 'o>', 'U': 'y>'}  # vowels

        # Deal only with vowels. Marking a letter keeps the length of
        # ``source`` unchanged, so the indices of the enumeration stay valid.
        for l_idx, letter in enumerate(source):
            if letter not in klinkers:
                continue
            if letter == 'E' and l_idx == len(source) - 1:  # 'E' at the end of a word becomes an 'uh' sound
                klank_fon = 'y>'
            else:
                klank_fon = klinkers[letter]
                # A vowel followed by a single (not doubled) consonant becomes a longer sound
                if l_idx + 2 < len(source) \
                        and source[l_idx + 1] in medeklinkers \
                        and source[l_idx + 2] not in medeklinkers:
                    klank_fon = klank_fon[0] + ':'
            DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
            source = DutchPhonetics._mark_consumed(source, l_idx, 1)

        # Deal only with consonants; every consonant gets exactly one sound.
        for l_idx, letter in enumerate(source):
            is_last = l_idx == len(source) - 1
            volgende = '' if is_last else source[l_idx + 1]  # next letter, '' at the end

            if letter in stemloos:
                if volgende in stemhebbend and letter in verstemhebbing:
                    klank_fon = verstemhebbing[letter]
                else:
                    klank_fon = stemloos[letter]
            elif letter in stemhebbend:
                # Devoice before a voiceless consonant and at the end of the word (final devoicing)
                if letter in verstemlozing and (volgende in stemloos or is_last):
                    klank_fon = verstemlozing[letter]
                else:
                    klank_fon = stemhebbend[letter]
            else:
                continue

            source = DutchPhonetics._mark_consumed(source, l_idx, 1)
            DutchPhonetics.update_representations(representations, letter, 1, klank_fon)

        # e.g. journaal & jas
        ambigu = {'J': ('z>', 'j', 'dz>'), 'X': ('ks', 'z')}
        source, representations = DutchPhonetics._apply_ambiguous_table(source, representations, ambigu, 1)

        # Letters with diacritics. Marks used in Dutch: {acute: é, grave: è, diaeresis: ë,
        # umlaut: ü, circumflex: ê, cedilla: ç}
        # The cedilla is only used on very few letters and very infrequently, so it is not included
        aigu = {'Á': 'a:', 'É': 'e:', 'Í': 'i', 'Ó': 'o:', 'Ú': 'y', 'Ý': 'e>i'}
        grave_circonflexe = {'À': 'a>', 'È': 'e>', 'Ì': 'i>', 'Ò': 'o>', 'Ù': 'y>',
                             'Â': 'a>', 'Ê': 'e>', 'Î': 'i>', 'Ô': 'o>', 'Û': 'y>'}
        trema = {'Ë': 'e>', 'Ï': 'i>'}
        overig = {'Y': 'e>i', 'Q': 'k'}  # remaining letters
        for l_idx, letter in enumerate(source):
            for tabel in (aigu, grave_circonflexe, trema, overig):
                if letter in tabel:
                    source = DutchPhonetics._mark_consumed(source, l_idx, 1)
                    DutchPhonetics.update_representations(representations, letter, 1, tabel[letter])
                    break

        trema_umlaut = {'Ä': ('a>', 'e>'), 'Ö': ('o>', '2:'), 'Ü': ('y>', 'y')}  # Heavily irregular symbol
        return DutchPhonetics._apply_ambiguous_table(source, representations, trema_umlaut, 1)

    @staticmethod
    def update_rep_ambiguous(representations, amb_spelling, spel_len, amb_klanken):
        """Return a new list holding one variant of every representation per sound.

        The input list is not modified; the result is grouped per sound in the
        order of ``amb_klanken``.
        """
        all_new_reps = []
        for amb_klank in amb_klanken:
            n_representations = list(representations)  # strings are immutable: shallow copy suffices
            DutchPhonetics.update_representations(n_representations, amb_spelling, spel_len, amb_klank)
            all_new_reps.extend(n_representations)

        return all_new_reps

    @staticmethod
    def update_representations(representations, spelling, spel_len, klank, additie="", additie_len=0):
        """Replace, in place, the first ``spelling`` in every representation.

        ``spel_len + additie_len`` characters starting at the match are
        replaced by ``klank + additie``. Representations that do not contain
        ``spelling`` are left untouched.
        """
        fon_rep = klank + additie
        tot_len = spel_len + additie_len
        for rep_idx, rep in enumerate(representations):
            spelling_pos = rep.find(spelling)
            if spelling_pos != -1:
                representations[rep_idx] = rep[:spelling_pos] + fon_rep + rep[spelling_pos + tot_len:]

    @staticmethod
    def _mark_consumed(source, pos, length):
        """Return ``source`` with ``length`` characters at ``pos`` replaced by one '+'."""
        return source[:pos] + "+" + source[pos + length:]

    @staticmethod
    def _apply_table(source, representations, tabel, spel_len):
        """Transcribe every occurrence of each spelling in ``tabel``; return the marked source."""
        for spelling, klank_fon in tabel.items():
            pos = source.find(spelling)
            while pos != -1:
                source = DutchPhonetics._mark_consumed(source, pos, spel_len)
                DutchPhonetics.update_representations(representations, spelling, spel_len, klank_fon)
                pos = source.find(spelling)
        return source

    @staticmethod
    def _apply_ambiguous_table(source, representations, tabel, spel_len):
        """Like ``_apply_table`` for spellings with several sounds; returns (source, representations)."""
        for spelling, klanken in tabel.items():
            pos = source.find(spelling)
            while pos != -1:
                source = DutchPhonetics._mark_consumed(source, pos, spel_len)
                representations = DutchPhonetics.update_rep_ambiguous(representations, spelling,
                                                                      spel_len, klanken)
                pos = source.find(spelling)
        return source, representations
+#DutchPhonetics.mmetaphone("Pet") +#DutchPhonetics.mmetaphone("Petten") +#DutchPhonetics.mmetaphone("Peter") +#DutchPhonetics.mmetaphone("Feeën") +#DutchPhonetics.mmetaphone("Schakelen") +#DutchPhonetics.mmetaphone("Ik") +#DutchPhonetics.mmetaphone("Radioprogramma") diff --git a/umbra/statistics.py b/umbra/statistics.py index 4ae427b4fc6d2f41ab883c7ec466cb5d6e3dc0d3..0b6a761149917a9914de3f85eeaf1e4ccbcaab65 100644 --- a/umbra/statistics.py +++ b/umbra/statistics.py @@ -1,6 +1,7 @@ from saa_algorithm import SaaAlgorithm from saa_Romeo import SaaRomeo from anchor_algorithm import AnchorAlgorithm +from dutch_mmetaphone import DutchPhonetics as dp class Statistics: @@ -31,9 +32,12 @@ class Statistics: print('Romeo') self._strategy = SaaRomeo() source_align, shadow_align = self._strategy.align(source, shadow) - print([str(x) for x in source_align.words]) - print([str(x) for x in shadow_align.words]) + source_words = [str(x) for x in source_align.words] + + #print([str(x) for x in source_align.words]) + #print([str(x) for x in shadow_align.words]) correctness = self._strategy.correctly_shadowed(source) + fon_reps = [dp.mmetaphone(x) for x in source_words] # Reset the is_shadowed property for word in source.words: diff --git a/umbra/words.py b/umbra/words.py index 4d206516ec7d7f8dbb721da7f289bd93c961f18b..2936c860eb489272d4239fc1893c5414342ffc69 100644 --- a/umbra/words.py +++ b/umbra/words.py @@ -6,7 +6,7 @@ class Word: self._anchor = None def __str__(self): - return "%s | %f | %f" % (self._word, self._onset, self._offset) + return self._word def __len__(self): return 1