Skip to content
Snippets Groups Projects
Commit 786d068f authored by Nijsen, T's avatar Nijsen, T
Browse files

Extended the algorithm to deal with diacritic symbols where possible....

Extended the algorithm to deal with diacritic symbols where possible. Furthermore, all functions have been moved into a class.
parent ca1f288f
No related branches found
No related tags found
1 merge request!60Sc 111/dmetaphone
......@@ -9,178 +9,240 @@
import copy
def mmetaphone(source):
    """Return the phonetic representations produced for ``source``.

    The word is upper-cased and then handed, in order, to the rule sets
    ``derden``, ``dubbelen`` and ``enkelen``. Every rule set receives the
    current working string together with the representations collected so
    far and returns updated versions of both.

    Args:
        source: The word to encode.

    Returns:
        The list of representations returned by the last rule set.
    """
    remaining = copy.deepcopy(source.upper())  # working copy handed from rule set to rule set
    phonetic_forms = []
    for apply_rules in (derden, dubbelen, enkelen):
        remaining, phonetic_forms = apply_rules(remaining, phonetic_forms)
    return phonetic_forms
def derden(source, representations):
    """Rewrite the three-letter exception patterns to their phonetic symbols.

    Every occurrence of 'AIL' and 'TSJ' is replaced by its phonetic spelling
    in a new representation, and by a single '+' in the working string ('+'
    marks a piece that has already been recognised).

    Args:
        source: Upper-cased working string.
        representations: Representations collected so far. The new
            representation is appended in place when it is not present yet.

    Returns:
        A ``(source, representations)`` tuple: the working string with the
        recognised pieces replaced by '+', and the same list object that was
        passed in.
    """
    uitzonderingen = {'AIL': 'a>i', 'TSJ': 'ts>'}
    new_rep = source  # New representation without ambiguities yet
    for uz, klank in uitzonderingen.items():
        # Replace by pattern rather than by a shared index: the working string
        # shrinks (3 characters -> '+') while the representation keeps its
        # length, so an index found in one is wrong for the other as soon as
        # a second match exists.
        new_rep = new_rep.replace(uz, klank)
        source = source.replace(uz, "+")
    if new_rep not in representations:
        representations.append(new_rep)
    return source, representations
def dubbelen(source, representations):
    """Rewrite two-letter sound patterns to their phonetic symbols.

    Three passes are made over the working string:

    1. Long vowels / digraphs ('AA', 'EE', 'IE', 'OO', 'OE'), optionally
       followed by 'I' (after 'AA', 'OO', 'OE') or 'UW' (after 'EE', 'IE'),
       in which case the suffix is consumed as well and rendered as 'i' / 'u'.
    2. The remaining unambiguous two-letter patterns in ``dubbel``.
    3. Ambiguous patterns ('CH'), which multiply the list of representations
       by the number of possible sounds.

    Each recognised piece is replaced by '+' in the working string.

    Args:
        source: Upper-cased working string.
        representations: Representations collected so far; updated in place
            by the unambiguous passes.

    Returns:
        A ``(source, representations)`` tuple. When an ambiguous pattern was
        found, ``representations`` is a new list rather than the one passed in.
    """
    d_len = 2

    dubbel_tweeklank = {'AA': 'a:', 'EE': 'e:', 'IE': 'i', 'OO': 'o:', 'OE': 'u'}
    for dtk, klank_fon in dubbel_tweeklank.items():
        dtk_pos = source.find(dtk)
        while dtk_pos != -1:
            volgt = dtk_pos + d_len  # index of the first character after the pattern
            if dtk in {'AA', 'OO', 'OE'} and source.startswith('I', volgt):
                additie, additie_len = 'i', 1
            elif dtk in {'EE', 'IE'} and source.startswith('UW', volgt):
                additie, additie_len = 'u', 2
            else:
                # Plain long vowel: previously left untouched, so it was later
                # encoded letter by letter instead of with its own symbol.
                additie, additie_len = '', 0
            source = source[:dtk_pos] + "+" + source[volgt + additie_len:]
            update_representations(representations, dtk, d_len, klank_fon, additie, additie_len)
            dtk_pos = source.find(dtk)

    dubbel = {'UU': 'y', 'EU': '2:', 'OE': 'u', 'UE': 'u:', 'IJ': 'e>i', 'EI': 'e>i',
              'UI': '9y', 'OU': 'v>u', 'AU': 'v>u', 'SJ': 's>', 'NG': 'n>', 'DT': 't'}
    # The sound is taken from ``dubbel`` itself; looking it up in
    # ``dubbel_tweeklank`` raised KeyError for every key except 'OE'.
    for d, klank_fon in dubbel.items():
        while d in source:
            source = source.replace(d, "+", 1)
            update_representations(representations, d, d_len, klank_fon)

    # e.g. CH can become three kinds of sound, depending on whether it is
    # voiceless, voiced, or part of a loanword such as 'chef'.
    ambigu = {'CH': ('x', 'g>', 's>')}
    for amb, amb_klanken in ambigu.items():
        while amb in source:
            source = source.replace(amb, "+", 1)
            representations = update_rep_ambiguous(representations, amb, d_len, amb_klanken)
    return source, representations
# Legacy module-level rule set for single letters: vowels first, then consonants.
# Each handled letter is replaced by '+' in `source` and rewritten to its phonetic
# symbol in every representation via update_representations.
# NOTE(review): this definition is not contiguous in this view -- it stops on an
# `if` without a body; the remainder appears further down the file.
def enkelen(source, representations):
# Voiceless consonants and their phonetic symbols.
stemloos = {'P': 'p', 'T': 't', 'K': 'k', 'F': 'f', 'S': 's'}
# Voiced consonants and their phonetic symbols.
stemhebbend = {'B': 'b', 'D': 'd', 'V': 'v', 'Z': 'z', 'G': 'g>',
'H': 'h_', 'L': 'l', 'R': 'r', 'M': 'm', 'N': 'n',
'W': 'v_'}
# Devoicing: symbol used for a voiced consonant that is followed by a voiceless one
# (or, further down, that stands at the end of the word).
verstemlozing = {'V': 'f', 'Z': 's', 'G': 'x', 'D': 't'}
# Voicing: symbol used for a voiceless consonant that is followed by a voiced one.
verstemhebbing = {'F': 'v', 'S': 'z', 'P': 'b', 'T': 'd', 'K': 'g'}
# Vowels (klinkers) and their short phonetic symbols.
klinkers = {'A': 'a>', 'E': 'e>', 'I': 'i>', 'O': 'o>', 'U': 'y>'}
# First pass: vowels only. enumerate() keeps iterating the string object that
# `source` referred to when the loop started; rebinding `source` inside the loop
# does not change the iteration, and since one character is swapped for one '+'
# the indices stay aligned.
for l_idx, letter in enumerate(source):
if letter in klinkers:
source = source[:l_idx] + "+" + source[l_idx+1:]
if letter == 'E' and l_idx == len(source)-1: # 'E' at the end of a word becomes an 'uh' sound
klank_fon = 'y>'
update_representations(representations, letter, 1, klank_fon)
else:
klank_fon = klinkers[letter]
# A vowel that is not followed by a double consonant becomes a long sound:
# next character is a consonant and the one after it is not.
if l_idx + 2 < len(source):
if (source[l_idx+1] in stemloos or source[l_idx+1] in stemhebbend) \
and not(source[l_idx+2] in stemloos or source[l_idx+2] in stemhebbend):
klank_fon = klank_fon[0] + ':'
update_representations(representations, letter, 1, klank_fon)
# Second pass: consonants.
# NOTE(review): the four tests below are independent `if`s, not an elif chain, so a
# letter that matched one of the two assimilation rules is handed to
# update_representations a second time by the plain stemloos/stemhebbend test
# further down -- confirm whether that is intended.
for l_idx, letter in enumerate(source):
if letter in stemloos and l_idx+1 < len(source) and source[l_idx+1] in stemhebbend\
and letter in verstemhebbing:
source = source[:l_idx] + "+" + source[l_idx + 1:]
klank_fon = verstemhebbing[letter]
update_representations(representations, letter, 1, klank_fon)
if letter in stemhebbend and l_idx + 1 < len(source) and source[l_idx + 1] in stemloos\
and letter in verstemlozing:
source = source[:l_idx] + "+" + source[l_idx + 1:]
klank_fon = verstemlozing[letter]
update_representations(representations, letter, 1, klank_fon)
if letter in stemloos:
source = source[:l_idx] + "+" + source[l_idx + 1:]
klank_fon = stemloos[letter]
update_representations(representations, letter, 1, klank_fon)
if letter in stemhebbend:
source = source[:l_idx] + "+" + source[l_idx + 1:]
if l_idx == len(source)-1 and letter in verstemlozing: # final devoicing (Dutch: eindklankverscherping)
class DutchPhonetics:
    """Groups the Dutch phonetic-encoding routines as static methods."""

    @staticmethod
    def compare(word1, word2):
        """Tell whether two words share at least one phonetic representation.

        Args:
            word1: First word.
            word2: Second word.

        Returns:
            True when any representation of ``word1`` also occurs among the
            representations of ``word2``, otherwise False.
        """
        # Encode word1 first, then word2, matching the original evaluation order.
        reps_first = DutchPhonetics.mmetaphone(word1)
        reps_second = DutchPhonetics.mmetaphone(word2)
        for candidate in reps_first:
            if candidate in reps_second:
                return True
        return False
@staticmethod
def mmetaphone(source):
    """Return the phonetic representations produced for ``source``.

    The word is upper-cased and passed, in order, through
    ``DutchPhonetics.derden``, ``DutchPhonetics.dubbelen`` and
    ``DutchPhonetics.enkelen``. Each rule set takes the current working
    string and the representations so far and returns updated versions.

    Args:
        source: The word to encode.

    Returns:
        The list of representations returned by the last rule set.
    """
    remaining = copy.deepcopy(source.upper())  # working copy handed from rule set to rule set
    phonetic_forms = []
    for apply_rules in (DutchPhonetics.derden, DutchPhonetics.dubbelen, DutchPhonetics.enkelen):
        remaining, phonetic_forms = apply_rules(remaining, phonetic_forms)
    return phonetic_forms
@staticmethod
def derden(source, representations):
    """Rewrite the three-letter exception patterns to their phonetic symbols.

    Every occurrence of 'AIL' and 'TSJ' is replaced by its phonetic spelling
    in a new representation, and by a single '+' in the working string ('+'
    marks a piece that has already been recognised).

    Args:
        source: Upper-cased working string.
        representations: Representations collected so far. The new
            representation is appended in place when it is not present yet.

    Returns:
        A ``(source, representations)`` tuple: the working string with the
        recognised pieces replaced by '+', and the same list object that was
        passed in.
    """
    uitzonderingen = {'AIL': 'a>i', 'TSJ': 'ts>'}
    new_rep = source  # New representation without ambiguities yet
    for uz, klank in uitzonderingen.items():
        # Replace by pattern rather than by a shared index: the working string
        # shrinks (3 characters -> '+') while the representation keeps its
        # length, so an index found in one is wrong for the other from the
        # second match onwards (e.g. 'AILAIL' used to yield 'aa>iIL').
        new_rep = new_rep.replace(uz, klank)
        source = source.replace(uz, "+")
    if new_rep not in representations:
        representations.append(new_rep)
    return source, representations
@staticmethod
def dubbelen(source, representations):
    """Rewrite two-letter sound patterns to their phonetic symbols.

    Three passes are made over the working string:

    1. Long vowels / digraphs ('AA', 'EE', 'IE', 'OO', 'OE'), optionally
       followed by 'I' (after 'AA', 'OO', 'OE') or 'UW' (after 'EE', 'IE'),
       in which case the suffix is consumed as well and rendered as 'i' / 'u'.
    2. The remaining unambiguous two-letter patterns in ``dubbel``.
    3. Ambiguous patterns ('CH'), which multiply the list of representations
       by the number of possible sounds.

    Each recognised piece is replaced by '+' in the working string.

    Args:
        source: Upper-cased working string.
        representations: Representations collected so far; updated in place
            by the unambiguous passes.

    Returns:
        A ``(source, representations)`` tuple. When an ambiguous pattern was
        found, ``representations`` is a new list rather than the one passed in.
    """
    d_len = 2

    dubbel_tweeklank = {'AA': 'a:', 'EE': 'e:', 'IE': 'i', 'OO': 'o:', 'OE': 'u'}
    for dtk, klank_fon in dubbel_tweeklank.items():
        dtk_pos = source.find(dtk)
        while dtk_pos != -1:
            volgt = dtk_pos + d_len  # index of the first character after the pattern
            # Exactly one branch may run per match. With two separate `if`s the
            # trailing `else` also ran after the 'I' branch, using an index into
            # the already shortened string and swallowing an extra character.
            if dtk in {'AA', 'OO', 'OE'} and source.startswith('I', volgt):
                additie, additie_len = 'i', 1
            elif dtk in {'EE', 'IE'} and source.startswith('UW', volgt):
                additie, additie_len = 'u', 2
            else:
                additie, additie_len = '', 0
            source = source[:dtk_pos] + "+" + source[volgt + additie_len:]
            DutchPhonetics.update_representations(
                representations, dtk, d_len, klank_fon, additie, additie_len)
            dtk_pos = source.find(dtk)

    dubbel = {'UU': 'y', 'EU': '2:', 'OE': 'u', 'UE': 'u:', 'IJ': 'e>i', 'EI': 'e>i',
              'UI': '9y', 'OU': 'v>u', 'AU': 'v>u', 'SJ': 's>', 'NG': 'n>', 'DT': 't',
              'TH': 't'}
    # The sound is taken from ``dubbel`` itself; looking it up in
    # ``dubbel_tweeklank`` raised KeyError for every key except 'OE'.
    for d, klank_fon in dubbel.items():
        while d in source:
            source = source.replace(d, "+", 1)
            DutchPhonetics.update_representations(representations, d, d_len, klank_fon)

    # e.g. CH can become three kinds of sound, depending on whether it is
    # voiceless, voiced, or part of a loanword such as 'chef'.
    ambigu = {'CH': ('x', 'g>', 's>')}
    for amb, amb_klanken in ambigu.items():
        while amb in source:
            source = source.replace(amb, "+", 1)
            representations = DutchPhonetics.update_rep_ambiguous(
                representations, amb, d_len, amb_klanken)
    return source, representations
# Rule set for single letters: vowels first, then consonants (and, in the part of
# this definition that appears further down the file, ambiguous letters and
# diacritics). Each handled letter is replaced by '+' in `source` and rewritten to
# its phonetic symbol in every representation.
# NOTE(review): this definition is not contiguous in this view -- it is interrupted
# after the `klank_fon = verstemlozing[letter]` line and resumes later.
@staticmethod
def enkelen(source, representations):
# Voiceless consonants and their phonetic symbols.
stemloos = {'P': 'p', 'T': 't', 'K': 'k', 'F': 'f', 'S': 's'}
# Voiced consonants and their phonetic symbols.
stemhebbend = {'B': 'b', 'D': 'd', 'V': 'v', 'Z': 'z', 'G': 'g>',
'H': 'h_', 'L': 'l', 'R': 'r', 'M': 'm', 'N': 'n',
'W': 'v_'}
# Devoicing: symbol used for a voiced consonant followed by a voiceless one.
verstemlozing = {'V': 'f', 'Z': 's', 'G': 'x', 'D': 't'}
# Voicing: symbol used for a voiceless consonant followed by a voiced one.
verstemhebbing = {'F': 'v', 'S': 'z', 'P': 'b', 'T': 'd', 'K': 'g'}
# Vowels (klinkers) and their short phonetic symbols.
klinkers = {'A': 'a>', 'E': 'e>', 'I': 'i>', 'O': 'o>', 'U': 'y>'}
# Deal only with vowels (klinkers). enumerate() keeps iterating the string object
# that `source` referred to when the loop started; one character is swapped for
# one '+', so the indices stay aligned with the rebound `source`.
for l_idx, letter in enumerate(source):
if letter in klinkers:
source = source[:l_idx] + "+" + source[l_idx+1:]
if letter == 'E' and l_idx == len(source)-1: # 'E' at the end of a word becomes an 'uh' sound
klank_fon = 'y>'
DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
else:
klank_fon = klinkers[letter]
# A vowel that is not followed by a double consonant becomes a long sound:
# next character is a consonant and the one after it is not.
if l_idx + 2 < len(source):
if (source[l_idx+1] in stemloos or source[l_idx+1] in stemhebbend) \
and not(source[l_idx+2] in stemloos or source[l_idx+2] in stemhebbend):
klank_fon = klank_fon[0] + ':'
DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
# Deal only with consonants (medeklinkers).
# NOTE(review): the tests in this loop are independent `if`s, not an elif chain, so
# a letter that matched an assimilation rule is also matched by the plain
# stemloos/stemhebbend test later in the loop -- confirm whether that is intended.
for l_idx, letter in enumerate(source):
if letter in stemloos and l_idx+1 < len(source) and source[l_idx+1] in stemhebbend\
and letter in verstemhebbing:
source = source[:l_idx] + "+" + source[l_idx + 1:]
klank_fon = verstemhebbing[letter]
DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
if letter in stemhebbend and l_idx + 1 < len(source) and source[l_idx + 1] in stemloos\
and letter in verstemlozing:
source = source[:l_idx] + "+" + source[l_idx + 1:]
klank_fon = verstemlozing[letter]
# Tail of the legacy module-level enkelen(): default voiced-consonant symbol,
# then the ambiguous single letters, then the return.
# NOTE(review): the statements this `else:` pairs with are not adjacent in this view.
else:
klank_fon = stemhebbend[letter]
update_representations(representations, letter, 1, klank_fon)
# Ambiguous single letters, e.g. the 'J' in "journaal" versus "jas".
ambigu = {'J': ('z>', 'j', 'dz>'), 'X': ('ks', 'z')}
for amb in ambigu:
amb_pos = source.find(amb)
# Only the first occurrence of each ambiguous letter is handled (`if`, not a loop).
if amb_pos != -1:
source = source[:amb_pos] + "+" + source[amb_pos + 1:]
amb_klanken = ambigu[amb]
# update_rep_ambiguous returns a new list holding one variant per possible sound.
representations = update_rep_ambiguous(representations, amb, 1, amb_klanken)
# Prints the remaining working string and all representations on every call.
print(source, representations)
return source, representations
def update_rep_ambiguous(representations, amb_spelling, spel_len, amb_klanken):
    """Expand the representations with one variant per possible sound.

    For every sound in ``amb_klanken`` a copy of ``representations`` is made
    and ``amb_spelling`` is rewritten to that sound in the copy. The copies
    are concatenated in the order of ``amb_klanken``.

    Args:
        representations: Current representations; not modified.
        amb_spelling: The ambiguous spelling to rewrite.
        spel_len: Number of characters the spelling occupies.
        amb_klanken: The possible phonetic symbols for the spelling.

    Returns:
        A new list of ``len(amb_klanken) * len(representations)`` strings.
    """
    def _variant(klank):
        # Work on a copy so that every sound starts from the same input.
        kopie = copy.deepcopy(representations)
        update_representations(kopie, amb_spelling, spel_len, klank)
        return kopie

    return [rep for klank in amb_klanken for rep in _variant(klank)]
def update_representations(representations, spelling, spel_len, klank, additie="", additie_len=0):
    """Rewrite the first occurrence of ``spelling`` in every representation.

    Args:
        representations: List of representation strings; modified in place.
        spelling: The spelling to look for.
        spel_len: Number of characters occupied by the spelling.
        klank: Phonetic symbol that replaces the spelling.
        additie: Extra phonetic text appended directly after ``klank``.
        additie_len: Number of additional source characters, following the
            spelling, that are consumed together with it.

    Returns:
        None. Representations that do not contain ``spelling`` are left as is.
    """
    def _rewrite(rep):
        start = rep.find(spelling)
        if start < 0:
            return rep
        stop = start + spel_len + additie_len
        return rep[:start] + (klank + additie) + rep[stop:]

    # Slice assignment keeps the caller's list object and only swaps its items.
    representations[:] = [_rewrite(rep) for rep in representations]
# Continuation of DutchPhonetics.enkelen(): rest of the consonant pass, then the
# ambiguous single letters, then letters with diacritics, then the return.
DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
# Plain voiceless consonant.
if letter in stemloos:
source = source[:l_idx] + "+" + source[l_idx + 1:]
klank_fon = stemloos[letter]
DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
# Plain voiced consonant, devoiced when it is the last character of the word.
if letter in stemhebbend:
source = source[:l_idx] + "+" + source[l_idx + 1:]
if l_idx == len(source)-1 and letter in verstemlozing: # final devoicing (Dutch: eindklankverscherping)
klank_fon = verstemlozing[letter]
DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
else:
klank_fon = stemhebbend[letter]
DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
# Ambiguous single letters, e.g. the 'J' in "journaal" versus "jas".
ambigu = {'J': ('z>', 'j', 'dz>'), 'X': ('ks', 'z')}
for amb in ambigu:
amb_pos = source.find(amb)
# Every occurrence is handled; each one multiplies the number of representations.
while amb_pos != -1:
source = source[:amb_pos] + "+" + source[amb_pos + 1:]
amb_klanken = ambigu[amb]
representations = DutchPhonetics.update_rep_ambiguous(representations, amb, 1, amb_klanken)
amb_pos = source.find(amb)
# Letters with diacritics. Diacritics used in Dutch: {acute accent: é, grave accent: è,
# trema (diaeresis): ë, umlaut: ü, circumflex: ê, cedilla: ç}
# Not all possibilities are in here because not everything can be written in Python here
# So these have been excluded
# Cedilla has only 2 letters associated and they are used very infrequently so it is not included
aigu = {'Á': 'a:', 'É': 'e:', 'Í': 'i', 'Ó': 'o:', 'Ú': 'y', 'Ý': 'e>i'}
grave_circonflexe = {'À': 'a>', 'È': 'e>', 'Ì': 'i>', 'Ò': 'o>', 'Ù': 'y>',
'Â': 'a>', 'Ê': 'e>', 'Î': 'i>', 'Ô': 'o>', 'Û': 'y>'}
trema = {'Ë': 'e>', 'Ï': 'i>'}
# Remaining plain letters that have a fixed sound.
overig = {'Y': 'e>i', 'Q': 'k'}
for l_idx, letter in enumerate(source):
if letter in aigu:
source = source[:l_idx] + "+" + source[l_idx + 1:]
klank_fon = aigu[letter]
DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
elif letter in grave_circonflexe:
source = source[:l_idx] + "+" + source[l_idx + 1:]
klank_fon = grave_circonflexe[letter]
DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
elif letter in trema:
source = source[:l_idx] + "+" + source[l_idx + 1:]
klank_fon = trema[letter]
DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
# NOTE(review): this branch is taken for every remaining character other than '+',
# but `overig` only contains 'Y' and 'Q'. Any other leftover character (for example
# 'Ä', 'Ö', 'Ü', which are only handled by the loop below) raises KeyError on the
# `overig[letter]` lookup -- `elif letter in overig:` looks like the intent; confirm.
elif letter != "+":
source = source[:l_idx] + "+" + source[l_idx + 1:]
klank_fon = overig[letter]
DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
trema_umlaut = {'Ä': ('a>', 'e>'), 'Ö': ('o>', '2:'), 'Ü': ('y>', 'y')} # Heavily irregular symbol
for amb in trema_umlaut:
amb_pos = source.find(amb)
while amb_pos != -1:
source = source[:amb_pos] + "+" + source[amb_pos + 1:]
# NOTE(review): `amb` iterates the keys of trema_umlaut, but the sounds are looked up
# in `ambigu`, which only has the keys 'J' and 'X' -- this raises KeyError whenever
# the loop body runs; `trema_umlaut[amb]` looks like the intent; confirm.
amb_klanken = ambigu[amb]
representations = DutchPhonetics.update_rep_ambiguous(representations, amb, 1, amb_klanken)
amb_pos = source.find(amb)
# Prints the remaining working string and all representations on every call.
print(source, representations)
return source, representations
@staticmethod
def update_rep_ambiguous(representations, amb_spelling, spel_len, amb_klanken):
    """Expand the representations with one variant per possible sound.

    For every sound in ``amb_klanken`` a copy of ``representations`` is made
    and ``amb_spelling`` is rewritten to that sound in the copy. The copies
    are concatenated in the order of ``amb_klanken``.

    Args:
        representations: Current representations; not modified.
        amb_spelling: The ambiguous spelling to rewrite.
        spel_len: Number of characters the spelling occupies.
        amb_klanken: The possible phonetic symbols for the spelling.

    Returns:
        A new list of ``len(amb_klanken) * len(representations)`` strings.
    """
    def _variant(klank):
        # Work on a copy so that every sound starts from the same input.
        kopie = copy.deepcopy(representations)
        DutchPhonetics.update_representations(kopie, amb_spelling, spel_len, klank)
        return kopie

    return [rep for klank in amb_klanken for rep in _variant(klank)]
@staticmethod
def update_representations(representations, spelling, spel_len, klank, additie="", additie_len=0):
    """Rewrite the first occurrence of ``spelling`` in every representation.

    Args:
        representations: List of representation strings; modified in place.
        spelling: The spelling to look for.
        spel_len: Number of characters occupied by the spelling.
        klank: Phonetic symbol that replaces the spelling.
        additie: Extra phonetic text appended directly after ``klank``.
        additie_len: Number of additional source characters, following the
            spelling, that are consumed together with it.

    Returns:
        None. Representations that do not contain ``spelling`` are left as is.
    """
    def _rewrite(rep):
        start = rep.find(spelling)
        if start < 0:
            return rep
        stop = start + spel_len + additie_len
        return rep[:start] + (klank + additie) + rep[stop:]

    # Slice assignment keeps the caller's list object and only swaps its items.
    representations[:] = [_rewrite(rep) for rep in representations]
# Test examples
# These calls run at import time. Calls to the module-level mmetaphone() and to
# DutchPhonetics.mmetaphone() both appear here; the return values are discarded.
mmetaphone("Detail")
mmetaphone("Haai")
mmetaphone("Leeuw")
mmetaphone("Lach")
mmetaphone('Jazz') # loanword; its representation cannot be captured fully unless handled as a special case
mmetaphone("Handvat") # 'dv' does not become a 'tf' sound yet
mmetaphone("Hand")
mmetaphone("Weggelopen")
DutchPhonetics.mmetaphone("Detail")
DutchPhonetics.mmetaphone("Haai")
DutchPhonetics.mmetaphone("Leeuw")
DutchPhonetics.mmetaphone("Lach")
DutchPhonetics.mmetaphone('Jazz') # loanword; its representation cannot be captured fully unless handled as a special case
DutchPhonetics.mmetaphone("Handvat") # 'dv' does not become a 'tf' sound yet
DutchPhonetics.mmetaphone("Hand")
DutchPhonetics.mmetaphone("Weggelopen")
# 'gg' does not become a 'xg>' sound yet, and because only a single consonant follows, the second 'e' becomes
# an 'e:' sound. It should be more like a 'y' sound, but the question is whether 'ge' occurs as a pattern often
# enough to be used as a special two-letter case.
mmetaphone("Pet")
mmetaphone("Petten")
mmetaphone("Peter")
DutchPhonetics.mmetaphone("Pet")
DutchPhonetics.mmetaphone("Petten")
DutchPhonetics.mmetaphone("Peter")
DutchPhonetics.mmetaphone("Feeën")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment