Skip to content
Snippets Groups Projects
Commit 5c9ab427 authored by Alfen, T. van (Tanja)'s avatar Alfen, T. van (Tanja)
Browse files

Revert "Merge branch 'master' of"

This reverts commit 2f48bf24
parent 2f48bf24
No related branches found
No related tags found
No related merge requests found
"""This is a new version of the previous algorithm
It is not a 'multiple' metaphone algorithm for
the Dutch language
It now works with rule set for structures at
certain levels.
The phonetic representation is an adapted form
of SAMPA. """
# Replacements: A : a>, S : s>, N : n>, h\ : h_, v\ : v_
# '+' : indicates removal of recognized pattern
import copy
class DutchPhonetics:
    """Rule-based phonetic encoder for Dutch words.

    A word is upper-cased and then rewritten in three passes: 3-letter
    patterns (``derden``), 2-letter patterns (``dubbelen``) and single
    letters (``enkelen``).  Every pass keeps two things in sync:

    * ``source``: the spelling still to be processed, in which each
      recognised pattern is replaced by a single ``"+"`` placeholder so it
      cannot be matched again;
    * ``representations``: a list of partially converted strings in which
      the recognised spelling is replaced by its (lower-case) phonetic
      symbols.  Ambiguous spellings multiply the number of representations.

    All methods are static; the class is used as a namespace.
    """

    @staticmethod
    def compare(word1, word2):
        """Tell whether two words share at least one phonetic representation.

        Args:
            word1: first source string.
            word2: second source string.

        Returns:
            True if any representation of ``word1`` is also a representation
            of ``word2``, otherwise False.
        """
        word1_representations = DutchPhonetics.mmetaphone(word1)
        word2_representations = set(DutchPhonetics.mmetaphone(word2))
        return any(w1_rep in word2_representations for w1_rep in word1_representations)

    @staticmethod
    def mmetaphone(source):
        """Convert a source string to its possible phonetic representations.

        Args:
            source: the word to convert (any letter case).

        Returns:
            A list of phonetic representation strings.  Characters for which
            no rule exists are left unchanged (upper-cased) in the output.
        """
        working_source = source.upper()
        representations = []
        structure_rule_sets = (
            DutchPhonetics.derden,
            DutchPhonetics.dubbelen,
            DutchPhonetics.enkelen,
        )
        for rule_set in structure_rule_sets:
            working_source, representations = rule_set(working_source, representations)
        return representations

    @staticmethod
    def derden(source, representations):
        """Recognise 3-letter patterns and convert them to their phonetic form.

        This is the first pass: it creates the initial representation from
        ``source`` and appends it to ``representations`` if it is not already
        present.

        Args:
            source: upper-cased spelling to scan.
            representations: list of representations (mutated in place).

        Returns:
            Tuple ``(source, representations)`` where every recognised pattern
            in ``source`` has been replaced by ``"+"``.
        """
        uitzonderingen = {'AIL': 'a>i', 'TSJ': 'ts>'}  # exceptions
        new_rep = source  # New representation without ambiguities yet
        for uz, klank in uitzonderingen.items():
            # Replace in each string independently: positions in `source`
            # shift once a pattern is collapsed to "+", so they must not be
            # reused to index into `new_rep`.
            new_rep = new_rep.replace(uz, klank)
            source = source.replace(uz, "+")
        if new_rep not in representations:
            representations.append(new_rep)
        return source, representations

    @staticmethod
    def dubbelen(source, representations):
        """Recognise 2-letter patterns and convert them to their phonetic form.

        Args:
            source: spelling still to be processed.
            representations: list of representations; updated in place for
                unambiguous patterns, replaced by a new list for 'CH'.

        Returns:
            Tuple ``(source, representations)``.
        """
        update = DutchPhonetics.update_representations
        d_len = 2

        # Double vowels / diphthongs that may absorb a following 'I' or 'UW'.
        dubbel_tweeklank = {'AA': 'a:', 'EE': 'e:', 'IE': 'i', 'OO': 'o:', 'OE': 'u'}
        met_i = {'AA', 'OO', 'OE'}
        met_uw = {'EE', 'IE'}
        for dtk, klank_fon in dubbel_tweeklank.items():
            dtk_pos = source.find(dtk)
            while dtk_pos != -1:
                # Reset for every occurrence so an earlier 'AAI' does not
                # leak its suffix into a later plain 'AA'.
                addition, add_len = '', 0
                tail = source[dtk_pos + d_len:]
                if dtk in met_i and tail.startswith('I'):
                    addition, add_len = 'i', 1
                elif dtk in met_uw and tail.startswith('UW'):
                    addition, add_len = 'u', 2
                update(representations, dtk, d_len, klank_fon, addition, add_len)
                source = source[:dtk_pos] + "+" + source[dtk_pos + d_len + add_len:]
                dtk_pos = source.find(dtk)

        dubbel = {'UU': 'y', 'EU': '2:', 'OE': 'u', 'UE': 'u:', 'IJ': 'e>i', 'EI': 'e>i',
                  'UI': '9y', 'OU': 'v>u', 'AU': 'v>u', 'SJ': 's>', 'NG': 'n>', 'DT': 't',
                  'TH': 't'}
        for d, klank_fon in dubbel.items():
            while d in source:
                update(representations, d, d_len, klank_fon)
                source = source.replace(d, "+", 1)

        # e.g. CH can have 3 types of sound, depending on whether it is voiced,
        # unvoiced or a word borrowed from another language, e.g. chef.
        ambigu = {'CH': ('x', 'g>', 's>')}
        for amb, amb_klanken in ambigu.items():
            while amb in source:
                representations = DutchPhonetics.update_rep_ambiguous(
                    representations, amb, d_len, amb_klanken)
                source = source.replace(amb, "+", 1)
        return source, representations

    @staticmethod
    def enkelen(source, representations):
        """Recognise single letters and convert them to their phonetic form.

        Order of processing: vowels, consonants (with voicing assimilation
        and final devoicing), the ambiguous letters J and X, letters with
        diacritics plus Y and Q, and finally the ambiguous umlaut letters.

        Args:
            source: spelling still to be processed.
            representations: list of representations; updated in place for
                unambiguous letters, replaced by a new list for ambiguous ones.

        Returns:
            Tuple ``(source, representations)``.
        """
        update = DutchPhonetics.update_representations

        stemloos = {'P': 'p', 'T': 't', 'K': 'k', 'F': 'f', 'S': 's'}  # voiceless
        stemhebbend = {'B': 'b', 'D': 'd', 'V': 'v', 'Z': 'z', 'G': 'g>',
                       'H': 'h_', 'L': 'l', 'R': 'r', 'M': 'm', 'N': 'n',
                       'W': 'v_'}  # voiced
        verstemlozing = {'V': 'f', 'Z': 's', 'G': 'x', 'D': 't'}  # devoicing
        verstemhebbing = {'F': 'v', 'S': 'z', 'P': 'b', 'T': 'd', 'K': 'g'}  # voicing
        klinkers = {'A': 'a>', 'E': 'e>', 'I': 'i>', 'O': 'o>', 'U': 'y>'}  # vowels

        def is_medeklinker(char):
            return char in stemloos or char in stemhebbend

        # Only single characters are replaced below, so the length is fixed.
        last_idx = len(source) - 1

        # Deal only with vowels.
        for l_idx, letter in enumerate(source):
            if letter not in klinkers:
                continue
            if letter == 'E' and l_idx == last_idx:
                # 'E' at the end of a word becomes an 'uh' sound.
                klank_fon = 'y>'
            else:
                klank_fon = klinkers[letter]
                # A vowel followed by exactly one consonant (not a consonant
                # pair) becomes the longer sound.
                if l_idx + 2 <= last_idx \
                        and is_medeklinker(source[l_idx + 1]) \
                        and not is_medeklinker(source[l_idx + 2]):
                    klank_fon = klank_fon[0] + ':'
            update(representations, letter, 1, klank_fon)
            source = source[:l_idx] + "+" + source[l_idx + 1:]

        # Deal only with consonants.  Exactly one rule applies per letter, so
        # a fallback can never rewrite a later occurrence of the same letter.
        for l_idx, letter in enumerate(source):
            volgende = source[l_idx + 1] if l_idx < last_idx else ''
            if letter in stemloos:
                if volgende in stemhebbend:
                    klank_fon = verstemhebbing[letter]  # voiced by its neighbour
                else:
                    klank_fon = stemloos[letter]
            elif letter in stemhebbend:
                if letter in verstemlozing and (volgende in stemloos or l_idx == last_idx):
                    # Devoiced by its neighbour, or final devoicing.
                    klank_fon = verstemlozing[letter]
                else:
                    klank_fon = stemhebbend[letter]
            else:
                continue
            update(representations, letter, 1, klank_fon)
            source = source[:l_idx] + "+" + source[l_idx + 1:]

        # e.g. journaal & jas or xanten & xenofobie
        ambigu = {'J': ('z>', 'j', 'dz>'), 'X': ('ks', 'z')}
        for amb, amb_klanken in ambigu.items():
            while amb in source:
                source = source.replace(amb, "+", 1)
                representations = DutchPhonetics.update_rep_ambiguous(
                    representations, amb, 1, amb_klanken)

        # Letters with diacritic signs.  Signs in Dutch: accent aigu (é),
        # accent grave (è), trema (ë), umlaut (ü), accent circonflexe (ê),
        # cedille (ç).  Not every combination is listed; the cedille is left
        # out because it is used very infrequently.
        aigu = {'Á': 'a:', 'É': 'e:', 'Í': 'i', 'Ó': 'o:', 'Ú': 'y', 'Ý': 'e>i'}
        grave_circonflexe = {'À': 'a>', 'È': 'e>', 'Ì': 'i>', 'Ò': 'o>', 'Ù': 'y>',
                             'Â': 'a>', 'Ê': 'e>', 'Î': 'i>', 'Ô': 'o>', 'Û': 'y>'}
        trema = {'Ë': 'e>', 'Ï': 'i>'}
        overig = {'Y': 'e>i', 'Q': 'k'}  # remaining letters
        # The four tables have disjoint keys, so one merged lookup suffices.
        enkel_overig = {**aigu, **grave_circonflexe, **trema, **overig}
        for l_idx, letter in enumerate(source):
            if letter in enkel_overig:
                update(representations, letter, 1, enkel_overig[letter])
                source = source[:l_idx] + "+" + source[l_idx + 1:]

        # Heavily irregular symbols: each has two possible sounds.
        trema_umlaut = {'Ä': ('a>', 'e>'), 'Ö': ('o>', '2:'), 'Ü': ('y>', 'y')}
        for amb, amb_klanken in trema_umlaut.items():
            while amb in source:
                representations = DutchPhonetics.update_rep_ambiguous(
                    representations, amb, 1, amb_klanken)
                source = source.replace(amb, "+", 1)
        return source, representations

    @staticmethod
    def update_rep_ambiguous(representations, amb_spelling, spel_len, amb_klanken):
        """Branch every representation once per possible sound of a spelling.

        Args:
            representations: current list of representations (not modified).
            amb_spelling: the ambiguous spelling to replace (first occurrence
                in each representation).
            spel_len: number of characters of the spelling.
            amb_klanken: iterable with the possible phonetic forms.

        Returns:
            A new list holding, for each sound in order, a copy of every
            representation with the spelling replaced by that sound.
            Duplicates are dropped; first-seen order is kept.
        """
        all_new_reps = []
        for amb_klank in amb_klanken:
            n_representations = list(representations)
            DutchPhonetics.update_representations(
                n_representations, amb_spelling, spel_len, amb_klank)
            for rep in n_representations:
                if rep not in all_new_reps:
                    all_new_reps.append(rep)
        return all_new_reps

    @staticmethod
    def update_representations(representations, spelling, spel_len, klank, additie="", additie_len=0):
        """Replace the first occurrence of a spelling in every representation.

        The list is modified in place; nothing is returned.

        Args:
            representations: list of representation strings to update.
            spelling: the spelling to look for.
            spel_len: number of characters of the spelling.
            klank: phonetic form that replaces the spelling.
            additie: extra phonetic symbols appended after ``klank``.
            additie_len: number of extra source characters, directly after
                the spelling, that ``additie`` stands for and that are
                therefore removed as well.
        """
        fon_rep = klank + additie
        tot_len = spel_len + additie_len
        for rep_idx, rep in enumerate(representations):
            spelling_pos = rep.find(spelling)
            if spelling_pos != -1:
                representations[rep_idx] = rep[:spelling_pos] + fon_rep + rep[spelling_pos + tot_len:]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment