diff --git a/umbra/form_checker.py b/umbra/form_checker.py index cf3583431b8fa1506f1efe90f8fe406509f89482..e5e823d7e75f8081e54c9fd78607f37df7385028 100644 --- a/umbra/form_checker.py +++ b/umbra/form_checker.py @@ -8,32 +8,38 @@ class FormChecker: """ Constructor """ - self.initialize_frames() - self._prefixes = ['','ge','be','ver','on','ont'] - self._affixes = ['','en','t','te','ten','de','den','s',"'s"] - self._verb_prefixes = ['ge', 'ver', 'be', 'ont'] + self._initialize_frames() + self._prefixes = ['', 'ge', 'be', 'ver', 'on', 'ont'] + self._affixes = ['', 'en', 't', 'te', 'ten', 'de', 'den', 's', "'s"] + self._verb_prefixes = ['ge', 'ver', 'be', 'ont', 'her', 'mis'] self._verb_affixes = ['den', 'ten', 'de', 'te', 'en', 't', 'd', 'n', ''] self._consonants = ['b', 'd', 'f', 'g', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'z', 'c', 'h', 'j', 'q', 'v', 'w', 'x'] self._vowels = ['a', 'e', 'o', 'u'] - def initialize_frames(self): + def _initialize_frames(self): """ Help the constructor by initializing the frames for irregular verbs. """ self._irr_verbs_frame = pd.read_csv(ut.get_path( 'resources/irregular_verbs.csv')) - self._irr_verbs_vte = self._irr_verbs_frame.copy() - self._irr_verbs_vtm = self._irr_verbs_frame.copy() - self._irr_verbs_vdw = self._irr_verbs_frame.copy() + self._irr_verbs_vte = self._create_frame('verleden tijd enkelvoud') + self._irr_verbs_vtm = self._create_frame('verleden tijd meervoud') + self._irr_verbs_vdw = self._create_frame('voltooid deelwoord') self._irr_verbs_frame = self._irr_verbs_frame.values - self._irr_verbs_vte = self._irr_verbs_vte.sort_values( - 'verleden tijd enkelvoud').values - self._irr_verbs_vtm = self._irr_verbs_vtm.sort_values( - 'verleden tijd meervoud').values - self._irr_verbs_vdw = self._irr_verbs_vdw.sort_values( - 'voltooid deelwoord').values + + def _create_frame(self, key): + """ + Create a new dataframe based on the key value given as parameter + + Args: + key: Key value for which column to sort the dataframe on. + + Returns: + A new dataframe sorted on the column corresponding to the key + """ + return self._irr_verbs_frame.copy().sort_values(key).values def form_related(self, source_word, shadow_word): """ @@ -46,11 +52,11 @@ class FormChecker: Returns: A boolean value indicating whether the arguments are form-related """ - return (self.fix_related(source_word, shadow_word) - or self.irr_verb_related(source_word, shadow_word) - or self.reg_verb_related(source_word, shadow_word)) + return self._fix_related(source_word, shadow_word) \ + or self._irr_verb_related(source_word, shadow_word) \ + or self._reg_verb_related(source_word, shadow_word) - def irr_verb_related(self, source_word, shadow_word): + def _irr_verb_related(self, source_word, shadow_word): """ Check whether the arguments are different versions of the same irregular verb. @@ -63,18 +69,13 @@ class FormChecker: A boolean value indicating whether the arguments are different versions of the same irregular verb. """ - return (self.irr_verb_helper(self._irr_verbs_frame, 0, - source_word, shadow_word) - or self.irr_verb_helper(self._irr_verbs_frame, 0, - source_word, shadow_word) - or self.irr_verb_helper(self._irr_verbs_vte, 1, - source_word, shadow_word) - or self.irr_verb_helper(self._irr_verbs_vtm, 2, - source_word, shadow_word) - or self.irr_verb_helper(self._irr_verbs_vdw, 3, - source_word, shadow_word)) + for key in range(4): + if self._irr_verb_helper(self._irr_verbs_frame, key, source_word, + shadow_word): + return True + return False - def irr_verb_helper(self, frame, key, source_word, shadow_word): + def _irr_verb_helper(self, frame, key, source_word, shadow_word): """ Search in one frame to see whether the shadow word and source word are in the same row. @@ -93,13 +94,35 @@ class FormChecker: i = bisect_left(frame[key], source_word) return i != len(frame[key]) and shadow_word in frame[i] - def reg_verb_related(self, source_word, shadow_word): - return self.reg_verb_helper(source_word, shadow_word) or \ - self.reg_verb_helper(shadow_word, source_word) + def _reg_verb_related(self, source_word, shadow_word): + """ + Check whether two words are different conjugations of the same verb - def reg_verb_helper(self, word1, word2): - stem = self.get_stem(word1) - vdw1, vdw2 = self.create_vdws(stem) + Args: + source_word: lemma of type string, corresponding to a source word + shadow_word: lemma of type string, corresponding to a shadow word + + Returns: + A boolean value indicating whether the two input words are + different conjugations of the same verb. + """ + return self._reg_verb_helper(source_word, shadow_word) or \ + self._reg_verb_helper(shadow_word, source_word) + + def _reg_verb_helper(self, word1, word2): + """ + Help the _reg_verb_related function, this function is one-directional + + Args: + word1: lemma of type string + word2: lemma of type string + + Returns: + A boolean value indicating whether word1 and word2 are different + conjugations of the same verb. + """ + stem = self._get_stem(word1) + vdw1, vdw2 = self._create_vdws(stem) if vdw1 == word2 or vdw2 == word2: return True for affix in self._verb_affixes: @@ -107,51 +130,91 @@ class FormChecker: return True return False - def create_vdws(self, stem): + def _create_vdws(self, stem): + """ + Creates the perfect time form of the verb based on the stem + + Args: + stem: the stem of a word as determined by other functions + + Returns: + The two possible perfect time forms based on the stem + + """ word = 'ge' + stem for v_prefix in self._verb_prefixes: - if stem[:len(v_prefix)] == v_prefix: - word = stem[2:] + if stem[:len(v_prefix)] == v_prefix and len(stem) > \ + len(v_prefix) + 1: + word = word[2:] word_1 = word+'d' word_2 = word+'t' return word_1, word_2 - def get_stem(self, word): + def _get_stem(self, word): + """ + Find the stem of the word given as input + + Args: + word: Lemma of type String + + Returns: + The stem of the word given as input + """ stem = '' for affix in self._verb_affixes: length = len(affix) if (word[-length:] == affix or affix == '') \ - and not self.is_vdw(word): - if affix == '': - stem = word - elif len(word) == len(affix): + and not self._is_vdw(word): + if affix == '' or len(word) == length: stem = word else: stem = word[:-length] - stem = self.create_single_stem(stem, affix) + stem = self._create_single_stem(stem, affix) return stem elif word[-length:] == affix: stem = word[2:-length] return stem return stem - def is_vdw(self, word): - return word[:2] == 'ge' and (word[-1]=='t' or word[-1]=='d') + def _is_vdw(self, word): + """ + Check whether the input word is in perfect time form - def create_single_stem(self, stem, affix): + Args: + word: Lemma of type string + + Returns: + A boolean value indicating whether the input word is in perfect + time form. + """ + return word[:2] == 'ge' and (word[-1] == 't' or word[-1] == 'd') + + def _create_single_stem(self, stem, affix): + """ + Find the single-person stem based on the stem and affix + + Args: + stem: The stem of a word, of type String + affix: The affix which was found, which was used in creating the + stem + + Returns: + The single-person stem of the stem given as input, based on the + stem itself and the affix + """ if affix == 'en' and len(stem)>2 and stem != affix: if stem[-1] in self._consonants and stem[-1] == stem[-2]: - stem= stem[:-1] - elif stem[-1] in self._consonants and stem[-2] in self._vowels and \ - stem[-3] in self._consonants: - stem= stem[:-2] + stem[-2] + stem[-2:] + stem = stem[:-1] + elif stem[-1] in self._consonants and stem[-2] in self._vowels \ + and stem[-3] in self._consonants: + stem = stem[:-1] + stem[-2:] if stem[-1] == 'z': return stem[:-1] + 's' elif stem[-1] == 'v': return stem[:-1] + 'f' return stem - def fix_related(self, source_word, shadow_word): + def _fix_related(self, source_word, shadow_word): """ Check whether the arguments are related in terms of their prefixes and affixes. @@ -164,10 +227,10 @@ class FormChecker: A boolean value indicating whether the arguments are related in terms of their prefixes and affixes. """ - return self.prefix_related(source_word, shadow_word) or \ - self.affix_related(source_word, shadow_word) + return self._prefix_related(source_word, shadow_word) or \ + self._affix_related(source_word, shadow_word) - def prefix_related(self, source_word, shadow_word): + def _prefix_related(self, source_word, shadow_word): """ Check whether the arguments are related in terms of their prefixes @@ -188,7 +251,7 @@ class FormChecker: return True return False - def affix_related(self, source_word, shadow_word): + def _affix_related(self, source_word, shadow_word): """ Check whether the arguments are related in terms of their prefixes and postfixes. @@ -208,4 +271,4 @@ class FormChecker: for aff2 in self._affixes: if shadow_rest + aff2 == source_word: return True - return False \ No newline at end of file + return False