Revert "Merge branch 'master' of gitlab.socsci.ru.nl:msdt/team1920-speechcomparison"

This reverts commit 2f48bf24

Revert "Merge branch 'master' of gitlab.socsci.ru.nl:msdt/team1920-speechcomparison"
This reverts commit 2f48bf24
5c9ab427 · Alfen, T. van (Tanja) · 2f48bf24 · 2f48bf24
Commit 5c9ab427 authored 5 years ago by Alfen, T. van (Tanja)
--- a/umbra/dutch_mmetaphone.py
+++ b/umbra/dutch_mmetaphone.py
-"""This is a new version of the previous algorithm
-   It is not a 'multiple' metaphone algorithm for
-   the Dutch language
-   It now works with rule set for structures at
-   certain levels.
-   The phonetic representation is an adapted form
-  of SAMPA. """
-# Replacements: A : a>, S : s>, N : n>, h\ : h_, v\ : v_
-# '+' : indicates removal of recognized pattern
-import copy
-
-
class DutchPhonetics:
    """Rule-based conversion of Dutch spellings to phonetic representations.

    A word is upper-cased and then passed through three rule sets that
    recognise 3-letter, 2-letter and 1-letter spelling patterns, in that
    order.  Two strings are kept in step while the rules run:

    * the *source*, in which every recognised pattern is replaced by a
      single ``+`` so that later rules cannot match it again;
    * the *representations*, a list of strings in which every recognised
      pattern is replaced by its (lower-case) phonetic symbol.  Ambiguous
      spellings multiply the number of representations.

    Letters for which no rule exists (for example a ``C`` that is not part
    of ``CH``) are left unchanged in the representations.
    """

    @staticmethod
    def compare(word1: str, word2: str) -> bool:
        """Tell whether two words share at least one phonetic representation.

        Args:
            word1: first source string.
            word2: second source string.

        Returns:
            True if any representation that ``mmetaphone`` produces for
            ``word1`` is also produced for ``word2``, otherwise False.
        """
        word2_representations = set(DutchPhonetics.mmetaphone(word2))
        return any(w1_rep in word2_representations
                   for w1_rep in DutchPhonetics.mmetaphone(word1))

    @staticmethod
    def mmetaphone(source: str) -> list:
        """Convert a source string to its possible phonetic representations.

        Args:
            source: the word to convert; it is upper-cased before use.

        Returns:
            A list of representation strings (one, or several when the word
            contains ambiguous spellings).
        """
        working_source = source.upper()
        representations = []

        # Longest patterns first, so that e.g. 'TSJ' is consumed before 'SJ'.
        structure_rule_sets = (DutchPhonetics.derden,
                               DutchPhonetics.dubbelen,
                               DutchPhonetics.enkelen)
        for rule_set in structure_rule_sets:
            working_source, representations = rule_set(working_source, representations)

        return representations

    @staticmethod
    def derden(source, representations):
        """Recognise 3-letter patterns and convert them to their phonetic form.

        A new representation is derived from ``source`` and appended to
        ``representations`` unless an equal string is already present.

        Args:
            source: upper-cased working string.
            representations: list of representations built so far
                (modified in place).

        Returns:
            Tuple ``(source, representations)`` where every recognised
            pattern in ``source`` has been replaced by ``+``.
        """
        uitzonderingen = {'AIL': 'a>i', 'TSJ': 'ts>'}
        new_rep = source  # New representation without ambiguities yet

        for uz, klank_fon in uitzonderingen.items():
            # Neither replacement text can complete a new upper-case pattern,
            # so one replace() pass equals repeated find-and-substitute and
            # keeps both strings aligned however many matches there are.
            new_rep = new_rep.replace(uz, klank_fon)
            source = source.replace(uz, "+")

        if new_rep not in representations:
            representations.append(new_rep)

        return source, representations

    @staticmethod
    def dubbelen(source, representations):
        """Recognise 2-letter patterns and convert them to their phonetic form.

        Handles, in order: long vowels / diphthongs that may be followed by
        a glide ('I' after AA/OO/OE, 'UW' after EE/IE), the remaining fixed
        2-letter spellings, and the ambiguous spelling 'CH'.

        Args:
            source: working string as returned by ``derden``.
            representations: list of representations built so far.

        Returns:
            Tuple ``(source, representations)``; the returned list may be a
            new, longer list when an ambiguous spelling was expanded.
        """
        dubbel_tweeklank = {'AA': 'a:', 'EE': 'e:', 'IE': 'i', 'OO': 'o:', 'OE': 'u'}
        d_len = 2

        for dtk, klank_fon in dubbel_tweeklank.items():
            dtk_pos = source.find(dtk)
            while dtk_pos != -1:
                # The glide is decided per match; it must not carry over
                # from an earlier match of the same pattern.
                addition, add_len = '', 0
                volgend = source[dtk_pos + d_len:dtk_pos + d_len + 2]
                if dtk in {'AA', 'OO', 'OE'} and volgend[:1] == 'I':
                    addition, add_len = 'i', 1
                elif dtk in {'EE', 'IE'} and volgend == 'UW':
                    addition, add_len = 'u', 2

                DutchPhonetics.update_representations(
                    representations, dtk, d_len, klank_fon, addition, add_len)
                source = source[:dtk_pos] + "+" + source[dtk_pos + d_len + add_len:]
                dtk_pos = source.find(dtk)

        dubbel = {'UU': 'y', 'EU': '2:', 'OE': 'u', 'UE': 'u:', 'IJ': 'e>i', 'EI': 'e>i',
                  'UI': '9y', 'OU': 'v>u', 'AU': 'v>u', 'SJ': 's>', 'NG': 'n>', 'DT': 't',
                  'TH': 't'}

        for d, klank_fon in dubbel.items():
            d_pos = source.find(d)
            while d_pos != -1:
                DutchPhonetics.update_representations(representations, d, d_len, klank_fon)
                source = source[:d_pos] + "+" + source[d_pos + d_len:]
                d_pos = source.find(d)

        # e.g. CH can have 3 types of sound, depending on whether it is voiced,
        # unvoiced or a word borrowed from another language. e.g. chef
        ambigu = {'CH': ('x', 'g>', 's>')}
        return DutchPhonetics._expand_ambiguous(source, representations, ambigu, d_len)

    @staticmethod
    def enkelen(source, representations):
        """Recognise single letters and convert them to their phonetic form.

        Handles, in order: plain vowels, consonants (including voicing /
        devoicing next to another consonant and devoicing in final
        position), the ambiguous letters J and X, letters with diacritics
        plus Y and Q, and finally the ambiguous letters with a
        trema/umlaut (Ä, Ö, Ü).

        Args:
            source: working string as returned by ``dubbelen``.
            representations: list of representations built so far.

        Returns:
            Tuple ``(source, representations)``; the returned list may be a
            new, longer list when an ambiguous letter was expanded.
        """
        stemloos = {'P': 'p', 'T': 't', 'K': 'k', 'F': 'f', 'S': 's'}
        stemhebbend = {'B': 'b', 'D': 'd', 'V': 'v', 'Z': 'z', 'G': 'g>',
                       'H': 'h_', 'L': 'l', 'R': 'r', 'M': 'm', 'N': 'n',
                       'W': 'v_'}

        verstemlozing = {'V': 'f', 'Z': 's', 'G': 'x', 'D': 't'}
        verstemhebbing = {'F': 'v', 'S': 'z', 'P': 'b', 'T': 'd', 'K': 'g'}

        klinkers = {'A': 'a>', 'E': 'e>', 'I': 'i>', 'O': 'o>', 'U': 'y>'}

        def is_medeklinker(teken):
            return teken in stemloos or teken in stemhebbend

        # Every substitution below swaps one character for one '+', so the
        # indices produced by enumerate() stay valid while `source` changes.

        # Deal only with klinkers
        for l_idx, letter in enumerate(source):
            if letter not in klinkers:
                continue
            if letter == 'E' and l_idx == len(source) - 1:
                # 'E' at the end of a word becomes an 'uh' sound
                klank_fon = 'y>'
            else:
                klank_fon = klinkers[letter]
                # Klinker followed by exactly one medeklinker (and then
                # something that is not a medeklinker) becomes a longer sound
                if l_idx + 2 < len(source) \
                        and is_medeklinker(source[l_idx + 1]) \
                        and not is_medeklinker(source[l_idx + 2]):
                    klank_fon = klank_fon[0] + ':'
            DutchPhonetics.update_representations(representations, letter, 1, klank_fon)
            source = source[:l_idx] + "+" + source[l_idx + 1:]

        # Deal only with medeklinkers.  Exactly one rule is applied per
        # letter; applying several would also convert the *next* occurrence
        # of the same letter in the representations.
        for l_idx, letter in enumerate(source):
            volgende = source[l_idx + 1:l_idx + 2]  # '' at the end of the word
            if letter in stemloos:
                if volgende in stemhebbend and letter in verstemhebbing:
                    klank_fon = verstemhebbing[letter]
                else:
                    klank_fon = stemloos[letter]
            elif letter in stemhebbend:
                if volgende in stemloos and letter in verstemlozing:
                    klank_fon = verstemlozing[letter]
                elif l_idx == len(source) - 1 and letter in verstemlozing:
                    klank_fon = verstemlozing[letter]  # final devoicing
                else:
                    klank_fon = stemhebbend[letter]
            else:
                continue
            source = source[:l_idx] + "+" + source[l_idx + 1:]
            DutchPhonetics.update_representations(representations, letter, 1, klank_fon)

        # e.g. journaal & jas or xanten & xenofobie
        ambigu = {'J': ('z>', 'j', 'dz>'), 'X': ('ks', 'z')}
        source, representations = DutchPhonetics._expand_ambiguous(
            source, representations, ambigu, 1)

        # Letters with diacritic signs. Signs in Dutch: accent aigu (é),
        # accent grave (è), trema (ë), umlaut (ü), accent circonflexe (ê) and
        # cedille (ç).  Not every combination is listed here; the cedille is
        # left out because it is used very infrequently.
        aigu = {'Á': 'a:', 'É': 'e:', 'Í': 'i', 'Ó': 'o:', 'Ú': 'y', 'Ý': 'e>i'}
        grave_circonflexe = {'À': 'a>', 'È': 'e>', 'Ì': 'i>', 'Ò': 'o>', 'Ù': 'y>',
                             'Â': 'a>', 'Ê': 'e>', 'Î': 'i>', 'Ô': 'o>', 'Û': 'y>'}
        trema = {'Ë': 'e>', 'Ï': 'i>'}
        overig = {'Y': 'e>i', 'Q': 'k'}
        for l_idx, letter in enumerate(source):
            for tabel in (aigu, grave_circonflexe, trema, overig):
                if letter in tabel:
                    source = source[:l_idx] + "+" + source[l_idx + 1:]
                    DutchPhonetics.update_representations(
                        representations, letter, 1, tabel[letter])
                    break

        # Heavily irregular symbols: each has two possible sounds.
        trema_umlaut = {'Ä': ('a>', 'e>'), 'Ö': ('o>', '2:'), 'Ü': ('y>', 'y')}
        source, representations = DutchPhonetics._expand_ambiguous(
            source, representations, trema_umlaut, 1)

        return source, representations

    @staticmethod
    def _expand_ambiguous(source, representations, ambigu, spel_len):
        """Expand every occurrence of the ambiguous spellings in ``ambigu``.

        Args:
            source: working string.
            representations: list of representations built so far.
            ambigu: dict mapping a spelling to a tuple of possible sounds.
            spel_len: number of characters in each spelling of ``ambigu``.

        Returns:
            Tuple ``(source, representations)`` with every occurrence masked
            by ``+`` in ``source`` and the representations multiplied by the
            number of possible sounds for each occurrence.
        """
        for amb, amb_klanken in ambigu.items():
            amb_pos = source.find(amb)
            while amb_pos != -1:
                representations = DutchPhonetics.update_rep_ambiguous(
                    representations, amb, spel_len, amb_klanken)
                source = source[:amb_pos] + "+" + source[amb_pos + spel_len:]
                amb_pos = source.find(amb)
        return source, representations

    @staticmethod
    def update_rep_ambiguous(representations, amb_spelling, spel_len, amb_klanken):
        """Build one set of representations per possible sound of a spelling.

        Args:
            representations: current list of representations (not modified).
            amb_spelling: the ambiguous spelling to replace.
            spel_len: number of characters of the spelling.
            amb_klanken: iterable with the possible sounds.

        Returns:
            A new list holding, for every sound in ``amb_klanken`` in order,
            a copy of ``representations`` in which the first occurrence of
            ``amb_spelling`` is replaced by that sound.
        """
        all_new_reps = []
        for amb_klank in amb_klanken:
            n_representations = list(representations)  # strings are immutable
            DutchPhonetics.update_representations(
                n_representations, amb_spelling, spel_len, amb_klank)
            all_new_reps.extend(n_representations)

        return all_new_reps

    @staticmethod
    def update_representations(representations, spelling, spel_len, klank, additie="", additie_len=0):
        """Replace the first occurrence of a spelling in every representation.

        The list is modified in place; representations that do not contain
        ``spelling`` are left untouched.

        Args:
            representations: list of representation strings to update.
            spelling: the (upper-case) spelling pattern to look for.
            spel_len: number of characters of ``spelling``.
            klank: phonetic symbol that replaces the spelling.
            additie: extra phonetic symbol appended after ``klank``.
            additie_len: number of source characters, directly after the
                spelling, that ``additie`` stands for and that are removed
                together with the spelling.
        """
        fon_rep = klank + additie
        tot_len = spel_len + additie_len
        for rep_idx, rep in enumerate(representations):
            spelling_pos = rep.find(spelling)
            if spelling_pos != -1:
                representations[rep_idx] = rep[:spelling_pos] + fon_rep + rep[spelling_pos + tot_len:]