Skip to content
Snippets Groups Projects
Commit 5e7429af authored by Alfen, T. van (Tanja)'s avatar Alfen, T. van (Tanja)
Browse files

Merge branch 'SC-143/phonetics_in_NW'

parents 3cff9580 422bd37e
No related branches found
No related tags found
2 merge requests: !79 Master, !72 Sc 143/phonetics in nw
from mistake_enum import Mistake
from dutch_mmetaphone import DutchPhonetics
class MistakeFinder:
......@@ -28,7 +29,7 @@ class MistakeFinder:
# Loop over the shadow:
for index, word in enumerate(self._shadow):
if not word.correct and word.mistake is None:
self._determine_mistake(index,word)
self._determine_mistake(index, word)
# Loop over the source:
for word in self._source:
if not word.shadowed and word.mistake is None:
......@@ -37,8 +38,7 @@ class MistakeFinder:
# PRINT FOR TEST OF SO FAR IMPLEMENTED MISTAKE CHECKS:
for word in self._shadow:
if word.mistake == Mistake.REPETITION or word.mistake\
== Mistake.SEMANTIC:
if not word.correct and word.mistake != Mistake.RANDOM:
print(str(word) + " " + str(word.mistake))
def _determine_mistake(self, index, word):
......@@ -52,11 +52,40 @@ class MistakeFinder:
"""
if not self._check_repetition(word, index): # If not a repetition,
# if not self._check_form_mistake(word): # form,
if not self._check_semantic_mistake(word,index): # semantic,
# if not self._check_phonentic_mistake(word): # phonetic;
if not self._check_semantic_mistake(word, index): # semantic,
if not self._check_phonetic_mistake(word, index): # phon;
# ... then shadow word is random:
word.mistake = Mistake.RANDOM
def _check_phonetic_mistake(self, shd_word, index):
    """Checks if shd_word is a phonetic mistake or not. If yes, it flags
    shd_word and the source word it belongs to as phonetic mistakes.

    The search starts at the source word that the previous anchor in the
    shadow points to (or at the first source word if there is no previous
    anchor) and walks forward through the source for as long as
    get_difference(shd_word) of the source word is greater than zero.

    Args:
        shd_word: the Word which is checked for phonetic mistake.
        index: index of shd_word in shadow.
    Returns:
        phonetic_mistake: True if shd_word is a phonetic mistake, False
        if not.
    """
    shd_anchor_index = self._shadow.find_previous_anchor(index)
    if shd_anchor_index < 0:  # If there is no anchor found,
        src_index = 0  # then just start at the beginning.
    else:
        src_index = self._source.index(
            self._shadow[shd_anchor_index].anchor)
    # The bounds test is done before indexing the source, so an empty
    # source no longer raises and the last source word is compared as well
    # (the previous "len - 1" bound silently skipped it).
    while src_index < len(self._source):
        src_word = self._source[src_index]
        if not src_word.get_difference(shd_word) > 0:
            break  # Same stop criterion as before: difference not > 0.
        if DutchPhonetics.compare(src_word.word, shd_word.word):
            # Flag both sides of the pair and stop at the first hit.
            shd_word.mistake = Mistake.PHONETIC
            src_word.mistake = Mistake.PHONETIC
            return True
        src_index += 1
    return False
def _check_semantic_mistake(self, shd_word, index):
"""Checks if shd_word is a semantic mistake or not. If yes, it flags
shd_word and the source word it belongs to as semantic mistakes.
......@@ -76,7 +105,8 @@ class MistakeFinder:
src_index = self._source.index(
self._shadow[shd_anchor_index].anchor)
while self._source[src_index].get_difference(shd_word) > 0 and\
not semantic_mistake and src_index < (len(self._source)-1): # Crude fix, but removes error
not semantic_mistake and src_index < (
len(self._source)-1): # Crude fix, but removes error
src_word = self._source[src_index]
if self.semantically_related(src_word, shd_word):
shd_word.mistake = Mistake.SEMANTIC
......@@ -100,8 +130,8 @@ class MistakeFinder:
assert 0 <= index < len(self._shadow)
found = self._check_pre_repetition(word, index)
if not found: # Check if word is the start of the next word.
if index < len(self._shadow)-1 and self._shadow[index+1].word.find(
word.word) == 0:
if index < len(self._shadow)-1 and self._shadow[index+1].word.\
startswith(word.word):
word.mistake = Mistake.REPETITION
found = True
return found
......@@ -125,9 +155,7 @@ class MistakeFinder:
while not found_before and not stop and i > 0:
i -= 1
stop = self._shadow[i].is_anchor()
found_at = self._shadow[i].word.find(word.word)
if found_at >= 0 and found_at == len(self._shadow[i].word)-\
len(word.word):
if self._shadow[i].word.endswith(word.word):
found_before = True
chain = found_before
......@@ -171,19 +199,5 @@ class MistakeFinder:
self._source = source
self._shadow = shadow
for word in self._shadow:
if word.mistake == Mistake.REPETITION or word.mistake\
== Mistake.SEMANTIC:
if not word.correct and word.mistake != Mistake.RANDOM:
print(str(word) + " " + str(word.mistake))
def _check_phonetic_mistake(self, shd_word):
    """
    Check whether a word can be seen as a phonetic mistake

    Placeholder: the check is not implemented here. The body is only
    `pass`, so calling this method returns None and flags nothing.

    Args:
        shd_word: the shadow word to check (not used by this stub)

    Returns:
        Intended to be a boolean value indicating whether the word can be
        seen as a phonetic mistake; currently always None.
    """
    pass
......@@ -3,6 +3,7 @@ from alignment_strategy import AlignmentStrategy
from words import Sentence
import numpy as np
from mistake_enum import Mistake
from dutch_mmetaphone import DutchPhonetics
class NeedlemanWunsch(AlignmentStrategy):
......@@ -15,6 +16,7 @@ class NeedlemanWunsch(AlignmentStrategy):
self._mismatch = -2
self._gap_sc = -1
self._seman_match = 2
self._phon_match = 2
self._repetition = 0
self._form_match = 2
self._pointers = ['diag', 'up', 'left']
......@@ -36,6 +38,8 @@ class NeedlemanWunsch(AlignmentStrategy):
gap_sc: the score that is allocated for a gap
seman_match: the score that is allocated when two words align
by virtue of semantic equivalence
phon_match: the score that is allocated when two words sound the
same.
repetition: the score that is allocated when a shadow word is
a stutter
"""
......@@ -47,6 +51,8 @@ class NeedlemanWunsch(AlignmentStrategy):
self._gap_sc = gap_sc
if seman_match:
self._seman_match = seman_match
if phon_match:
self._phon_match = phon_match
if repetition:
self._repetition = repetition
if form_match:
......@@ -96,15 +102,20 @@ class NeedlemanWunsch(AlignmentStrategy):
n = len(self._source)
m = len(self._shadow)
for i in range(1, m+1):
shadow_word = self._shadow[i - 1]
for j in range(1, n+1):
if self._source[j - 1] == self._shadow[i - 1]:
source_word = self._source[j - 1]
if source_word == shadow_word:
value = self._match
elif self._form_checker.form_related(self._source[j-1].word,
self._shadow[i-1].word):
value = self._form_match
elif self._seman_checker.semantically_related(
self._source[j-1].word, self._shadow[i-1].word):
source_word.word, shadow_word.word):
value = self._seman_match
elif DutchPhonetics.compare(source_word.word,
shadow_word.word):
value = self._phon_match
else:
value = self._mismatch
match_value = self._matrix[i-1, j-1]['value'] + value
......@@ -154,6 +165,8 @@ class NeedlemanWunsch(AlignmentStrategy):
alignment_shadow.append(self._shadow[i - 1])
if self._check_repetition(i - 1):
self._shadow[i - 1].mistake = Mistake.REPETITION
else:
self._shadow[i - 1].mistake = Mistake.RANDOM
i -= 1
alignment_source.reverse()
......@@ -194,4 +207,10 @@ class NeedlemanWunsch(AlignmentStrategy):
source.shadowed = True
source.mistake = Mistake.FORM
shadow.mistake = Mistake.FORM
elif DutchPhonetics.compare(source.word, shadow.word):
source.shadowed = True
source.mistake = Mistake.PHONETIC
shadow.mistake = Mistake.PHONETIC
else:
shadow.mistake = Mistake.RANDOM
return source, shadow
......@@ -16,6 +16,7 @@ class Word:
return 1
def __eq__(self, word):
    """Compare this word with another word.

    Note that this is not a symmetric equality: the result is only truthy
    when both words have the same text AND the onset of this word is
    strictly smaller than the onset of the other word. Swapping the
    operands can therefore give a different answer.

    Args:
        word: the object to compare with; its `word` and `onset`
            attributes are read.
    Returns:
        The result of `self._word == word.word and
        self._onset < word.onset`.
    """
    # TODO: move self._onset < word.onset out of __eq__
    return self._word == word.word and self._onset < word.onset
@property
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment