# Recovered from a revert patch (commit 5c9ab427, 2019-12-05) that deleted
# umbra/dutch_mmetaphone.py.
"""Rule-based phonetic encoding of Dutch words.

This is a new version of the previous algorithm. It is not a 'multiple'
metaphone algorithm for the Dutch language; instead it applies rule sets
to spelling structures of decreasing length (three letters, two letters,
single letters).

The phonetic representation is an adapted form of SAMPA.
"""
# Replacements: A : a>, S : s>, N : n>, h\ : h_, v\ : v_
# '+' : indicates removal of recognized pattern
import copy


class DutchPhonetics:
    """Convert Dutch spellings to phonetic strings and compare them.

    Every rule-set method takes the working ``source`` (upper-cased word in
    which already recognised patterns are replaced by ``'+'``) together with
    the list of candidate ``representations`` and returns both, updated.
    Spellings are upper case and phonetic symbols are lower case, so the
    part of a representation that still has to be converted is exactly its
    upper-case residue.
    """

    @staticmethod
    def compare(word1, word2):
        """Return True if the two words share a phonetic representation.

        Args:
            word1: string
            word2: string
        """
        word1_representations = DutchPhonetics.mmetaphone(word1)
        word2_representations = DutchPhonetics.mmetaphone(word2)
        return any(rep in word2_representations
                   for rep in word1_representations)

    @staticmethod
    def mmetaphone(source):
        """Convert a source string to its possible phonetic representations.

        Args:
            source: the word to convert; case is ignored.

        Returns:
            A list of strings, one per possible pronunciation. Ambiguous
            spellings (e.g. 'CH', 'J', 'X') multiply the number of entries.
        """
        working_source = source.upper()
        representations = []

        # Longest patterns first, so that e.g. 'TSJ' is consumed before 'SJ'.
        structure_rule_sets = [DutchPhonetics.derden,
                               DutchPhonetics.dubbelen,
                               DutchPhonetics.enkelen]

        for rule_set in structure_rule_sets:
            working_source, representations = rule_set(working_source,
                                                       representations)

        return representations

    @staticmethod
    def derden(source, representations):
        """Recognise 3 letter patterns and convert them to their phonetic form.

        The converted word is appended to ``representations`` (if not already
        present), which seeds the list for the later rule sets.

        Returns:
            Tuple ``(source, representations)`` where every recognised
            pattern in ``source`` has been replaced by ``'+'``.
        """
        uitzonderingen = {'AIL': 'a>i', 'TSJ': 'ts>'}  # exceptions
        new_rep = source  # New representation without ambiguities yet

        for spelling, klank in uitzonderingen.items():
            # Replace in each string separately: `source` shrinks to a single
            # '+' per match while `new_rep` does not, so positions found in
            # one string are not valid offsets in the other.
            new_rep = new_rep.replace(spelling, klank)
            source = source.replace(spelling, "+")

        if new_rep not in representations:
            representations.append(new_rep)

        return source, representations

    @staticmethod
    def dubbelen(source, representations):
        """Recognise 2 letter patterns and convert them to their phonetic form.

        Returns:
            Tuple ``(source, representations)``. ``representations`` may be a
            new, longer list when an ambiguous pattern ('CH') was found.
        """
        update = DutchPhonetics.update_representations
        d_len = 2

        # Long vowels / diphthongs that may absorb a following 'I' or 'UW'.
        dubbel_tweeklank = {'AA': 'a:', 'EE': 'e:', 'IE': 'i', 'OO': 'o:',
                            'OE': 'u'}

        for dtk, klank_fon in dubbel_tweeklank.items():
            dtk_pos = source.find(dtk)
            while dtk_pos != -1:
                # Reset for every occurrence so a suffix recognised at an
                # earlier match does not leak into this one.
                addition, add_len = '', 0
                rest = source[dtk_pos + d_len:]

                if dtk in {'AA', 'OO', 'OE'} and rest.startswith('I'):
                    addition, add_len = 'i', 1
                elif dtk in {'EE', 'IE'} and rest.startswith('UW'):
                    addition, add_len = 'u', 2

                update(representations, dtk, d_len, klank_fon,
                       addition, add_len)
                source = (source[:dtk_pos] + "+"
                          + source[dtk_pos + d_len + add_len:])
                dtk_pos = source.find(dtk)

        dubbel = {'UU': 'y', 'EU': '2:', 'OE': 'u', 'UE': 'u:', 'IJ': 'e>i',
                  'EI': 'e>i', 'UI': '9y', 'OU': 'v>u', 'AU': 'v>u',
                  'SJ': 's>', 'NG': 'n>', 'DT': 't', 'TH': 't'}

        for d, klank_fon in dubbel.items():
            d_pos = source.find(d)
            while d_pos != -1:
                update(representations, d, d_len, klank_fon)
                source = source[:d_pos] + "+" + source[d_pos + d_len:]
                d_pos = source.find(d)

        # e.g. CH can have 3 types of sound, depending on whether it is
        # voiced, unvoiced or a word borrowed from another language,
        # e.g. chef
        ambigu = {'CH': ('x', 'g>', 's>')}

        for amb, amb_klanken in ambigu.items():
            amb_pos = source.find(amb)
            while amb_pos != -1:
                representations = DutchPhonetics.update_rep_ambiguous(
                    representations, amb, d_len, amb_klanken)
                source = source[:amb_pos] + "+" + source[amb_pos + d_len:]
                amb_pos = source.find(amb)

        return source, representations

    @staticmethod
    def enkelen(source, representations):
        """Recognise single letters and convert them to their phonetic form.

        Passes, in order: vowels, consonants (with voicing assimilation and
        final devoicing), ambiguous consonants, letters with diacritics and
        finally the ambiguous trema/umlaut vowels.

        Returns:
            Tuple ``(source, representations)``. ``representations`` may be a
            new, longer list when an ambiguous letter was found.
        """
        update = DutchPhonetics.update_representations

        stemloos = {'P': 'p', 'T': 't', 'K': 'k', 'F': 'f',
                    'S': 's'}  # voiceless
        stemhebbend = {'B': 'b', 'D': 'd', 'V': 'v', 'Z': 'z', 'G': 'g>',
                       'H': 'h_', 'L': 'l', 'R': 'r', 'M': 'm', 'N': 'n',
                       'W': 'v_'}  # voiced

        verstemlozing = {'V': 'f', 'Z': 's', 'G': 'x', 'D': 't'}  # devoicing
        verstemhebbing = {'F': 'v', 'S': 'z', 'P': 'b', 'T': 'd',
                          'K': 'g'}  # voicing

        klinkers = {'A': 'a>', 'E': 'e>', 'I': 'i>', 'O': 'o>',
                    'U': 'y>'}  # vowels

        def is_medeklinker(letter):
            return letter in stemloos or letter in stemhebbend

        # Every replacement below swaps one character for '+', so the length
        # of `source` (and therefore this index) stays valid throughout.
        last_idx = len(source) - 1

        # Deal only with klinkers
        for l_idx, letter in enumerate(source):
            if letter not in klinkers:
                continue
            if letter == 'E' and l_idx == last_idx:
                # 'E' at the end of a word becomes an 'uh' sound
                klank_fon = 'y>'
            else:
                klank_fon = klinkers[letter]
                # Klinker without double medeklinker becomes a longer sound
                if (l_idx + 2 < len(source)
                        and is_medeklinker(source[l_idx + 1])
                        and not is_medeklinker(source[l_idx + 2])):
                    klank_fon = klank_fon[0] + ':'
            update(representations, letter, 1, klank_fon)
            source = source[:l_idx] + "+" + source[l_idx + 1:]

        # Deal only with medeklinkers. Exactly one rule applies per letter:
        # `update` always rewrites the first remaining occurrence, so a second
        # call for the same letter would hit a later position in the word.
        for l_idx, letter in enumerate(source):
            volgende = source[l_idx + 1] if l_idx < last_idx else ''
            if letter in stemloos:
                if volgende in stemhebbend and letter in verstemhebbing:
                    klank_fon = verstemhebbing[letter]
                else:
                    klank_fon = stemloos[letter]
            elif letter in stemhebbend:
                # Devoice before a voiceless consonant and at the word end
                # (final devoicing).
                if letter in verstemlozing and (volgende in stemloos
                                                or l_idx == last_idx):
                    klank_fon = verstemlozing[letter]
                else:
                    klank_fon = stemhebbend[letter]
            else:
                continue
            update(representations, letter, 1, klank_fon)
            source = source[:l_idx] + "+" + source[l_idx + 1:]

        # e.g. journaal & jas or xanten & xenofobie
        ambigu = {'J': ('z>', 'j', 'dz>'), 'X': ('ks', 'z')}
        for amb, amb_klanken in ambigu.items():
            amb_pos = source.find(amb)
            while amb_pos != -1:
                source = source[:amb_pos] + "+" + source[amb_pos + 1:]
                representations = DutchPhonetics.update_rep_ambiguous(
                    representations, amb, 1, amb_klanken)
                amb_pos = source.find(amb)

        # Letters with diacritic signs. Signs in Dutch: {accent aigu: é,
        # accent grave: è, trema: ë, umlaut: ü, accent circonflexe: ê,
        # cedille: ç}.
        # Cedille has only 2 letters associated and they are used very
        # infrequently, so it is not included.
        aigu = {'Á': 'a:', 'É': 'e:', 'Í': 'i', 'Ó': 'o:', 'Ú': 'y',
                'Ý': 'e>i'}
        grave_circonflexe = {'À': 'a>', 'È': 'e>', 'Ì': 'i>', 'Ò': 'o>',
                             'Ù': 'y>', 'Â': 'a>', 'Ê': 'e>', 'Î': 'i>',
                             'Ô': 'o>', 'Û': 'y>'}
        trema = {'Ë': 'e>', 'Ï': 'i>'}
        overig = {'Y': 'e>i', 'Q': 'k'}  # remaining plain letters
        # All four groups are handled identically and their keys are
        # disjoint, so a single lookup table suffices.
        enkel_teken = {**aigu, **grave_circonflexe, **trema, **overig}

        for l_idx, letter in enumerate(source):
            if letter in enkel_teken:
                source = source[:l_idx] + "+" + source[l_idx + 1:]
                update(representations, letter, 1, enkel_teken[letter])

        # Heavily irregular symbols
        trema_umlaut = {'Ä': ('a>', 'e>'), 'Ö': ('o>', '2:'),
                        'Ü': ('y>', 'y')}
        for amb, amb_klanken in trema_umlaut.items():
            amb_pos = source.find(amb)
            while amb_pos != -1:
                representations = DutchPhonetics.update_rep_ambiguous(
                    representations, amb, 1, amb_klanken)
                source = source[:amb_pos] + "+" + source[amb_pos + 1:]
                amb_pos = source.find(amb)

        return source, representations

    @staticmethod
    def update_rep_ambiguous(representations, amb_spelling, spel_len,
                             amb_klanken):
        """Branch every representation once per possible sound.

        Args:
            representations: current list of representations (not modified).
            amb_spelling: the ambiguous spelling to replace.
            spel_len: length of ``amb_spelling``.
            amb_klanken: iterable with the possible sounds of the spelling.

        Returns:
            A new list holding, for each sound in order, a copy of all
            representations with the first occurrence of ``amb_spelling``
            replaced by that sound.
        """
        all_new_reps = []
        for amb_klank in amb_klanken:
            # Strings are immutable, so a shallow copy of the list is enough.
            n_representations = copy.copy(representations)
            DutchPhonetics.update_representations(
                n_representations, amb_spelling, spel_len, amb_klank)
            all_new_reps.extend(n_representations)

        return all_new_reps

    @staticmethod
    def update_representations(representations, spelling, spel_len, klank,
                               additie="", additie_len=0):
        """Replace the first ``spelling`` in every representation, in place.

        Args:
            representations: list of representations; modified in place.
            spelling: the spelling pattern to look for.
            spel_len: length of ``spelling``.
            klank: phonetic symbol(s) that replace the spelling.
            additie: extra phonetic symbol(s) appended to ``klank``.
            additie_len: number of spelling characters after the pattern
                that ``additie`` stands for and that are removed as well.
        """
        fon_rep = klank + additie
        tot_len = spel_len + additie_len
        for rep_idx, rep in enumerate(representations):
            spelling_pos = rep.find(spelling)
            if spelling_pos != -1:
                representations[rep_idx] = (rep[:spelling_pos] + fon_rep
                                            + rep[spelling_pos + tot_len:])