diff --git a/umbra/alignment_strategy.py b/umbra/alignment_strategy.py index 7dda66a8cdbac52b6808195837a519f20c5e13a0..05fcd1319b0e2d0bceef9d2a0ec4ee5cb49a85e9 100644 --- a/umbra/alignment_strategy.py +++ b/umbra/alignment_strategy.py @@ -28,7 +28,6 @@ class AlignmentStrategy(ABC): total_words += 1 if source_word.shadowed: correct += 1 - print(source_word) else: skipped += 1 print("Total: %d | Correctly shadowed: %d | Skipped: %d" diff --git a/umbra/anchor_algorithm.py b/umbra/anchor_algorithm.py index 14c8d314cad139b8bd64b98d2403056e13c187c1..38d97004628da033591eef01c1793a167b1ade19 100644 --- a/umbra/anchor_algorithm.py +++ b/umbra/anchor_algorithm.py @@ -11,7 +11,12 @@ class AnchorAlgorithm(AlignmentStrategy): self._shadow = None def align(self, source, shadow): - """Start the comparison and eventually prints the results.""" + """Start the comparison and eventually prints the results. + + Args: + source: the source sentence + shadow: the shadow sentence + """ self._source = source self._shadow = shadow discrete_start_time = time.time() @@ -52,8 +57,7 @@ class AnchorAlgorithm(AlignmentStrategy): for src_index in range(src_start, src_end): src_word = self._source[src_index] if not src_word.is_anchor(): - self._search_word_in_interval(src_word, shd_start, shd_end, - source, shadow) + self._search_word_in_interval(src_word, shd_start, shd_end) else: # If src_word is an anchor, only search in shadow after it: shd_start = self._shadow.index(src_word.anchor()) @@ -70,7 +74,7 @@ class AnchorAlgorithm(AlignmentStrategy): competing_over = None for shadow_index in range(shd_start, shd_end): shd_word = self._shadow[shadow_index] - if src_word.word == shd_word.word and not found: + if src_word.__eq__(shd_word) and not found: if 0.05 < src_word.get_difference(shd_word) < 3.0: if not shd_word.correct: found = True @@ -82,7 +86,7 @@ class AnchorAlgorithm(AlignmentStrategy): competing_over = shd_word # If true, then competing_over was wrongly flagged as correct: if competing_over is not None: - competing_over.source().shadowed = False + competing_over.source.shadowed = False competing_over.source = src_word # Shadow to new source word src_word.shadowed = True diff --git a/umbra/controller.py b/umbra/controller.py index b4f63999b52aa9c198b7b4ade7c06348e2547784..f49ed42e7f2c2ae5b1100e574221c9e9cb1d8001 100644 --- a/umbra/controller.py +++ b/umbra/controller.py @@ -12,7 +12,7 @@ class Controller: self._view = view self._view.actionlistener = self.actionlistener - self._filereader = CSVReader("","") # TODO: Deal with csv OR txt + self._filereader = CSVReader() # TODO: Deal with csv OR txt self._filewriter = CSVWriter() # Lists of paths, represented as string @@ -79,8 +79,8 @@ class Controller: self._view.update_message('no shadow') else: self._view.update_message('files ok') - self._read_files("source") - self._read_files("shadow") + self._read_files(self._source_files[0],"source") # TODO: obviously extend to not only reading first one + self._read_files(self._shadow_files[0],"shadow") # TODO: fine for now, the model can only handle two anyways self._model.compare() self._view.update_message('comparison complete') @@ -95,19 +95,14 @@ class Controller: # results are of form sc, sh, info self._view.update_message('saved') - def _read_files(self, type): - """Read data from file paths and save to model. + def _read_files(self, path, word_type): + """Read data from file path and save to model. Args: - type (str): Role of file ('source' or 'shadow') + word_type (str): Role of file ('source' or 'shadow') """ - # TODO - # Temporary 'solution' for testing below - path = self._source_files[0] - print(path) - self._filereader = CSVReader(path, type) - data = self._filereader.read() - if type == "source": # Fix is dirtier than I can talk + data = self._filereader.read(path,word_type) + if word_type == "source": self._model._data_source = data else: self._model._data_shadow = data diff --git a/umbra/filereader.py b/umbra/filereader.py index a74e0c063349a6f6804c2c1bc815ae16d78a7178..b35c2589938167d6ca60eff7e468a25d8e552b41 100644 --- a/umbra/filereader.py +++ b/umbra/filereader.py @@ -9,7 +9,7 @@ class FileReader (ABC): """Read source and shadow files from given path.""" @abstractmethod - def read(self): + def read(self, path, word_type): pass @staticmethod @@ -47,50 +47,43 @@ class FileReader (ABC): else: word = ShadowWord(row.Word.lower(), row.Onset, row.Offset) ws.append(word) - words = Sentence(ws) + words = Sentence(ws) return words class CSVReader(FileReader): - def __init__(self, path, type): - """ + + def read(self, path, word_type): + """Method that is used to read the data into a workable format + Args: - path: string, the internal path to the source file - """ - self._path = path - self._type = type # Very dirty fix + path: the path that to the file that is to be read + word_type: specifying whether it is a source or shadow file + + Returns: - def read(self): - """Method that is used to read the data into a workable format""" - df = pd.read_csv(self._path, header=None, sep='\n') + """ + df = pd.read_csv(path, header=None, sep='\n') df = df[0].str.split('\t', expand=True) data = self.extract_task_data(df) - words = self.df_to_words(data, self._type) + words = self.df_to_words(data, word_type) return words class TxtReader(FileReader): - def __init__(self, path): - """Constructor - - Args: - path: path to file - """ - self._path = path - self._words = None - def read(self): + def read(self, path, word_type): """ Read data into Sentences. Returns: words: Sentence containing Words. """ - with open(self._path, 'r') as data: - self._words = self.extract_task_data(data) - self._words = self.df_to_words(self._words, "header") - return self._words + with open(path, 'r') as data: + words = self.extract_task_data(data) + words = self.df_to_words(self._words, "header") + return words class FileWriter(ABC): diff --git a/umbra/saa_Romeo.py b/umbra/saa_Romeo.py index 6b7d87478cfe7ed0e0f037793946a4ef57900fc4..1ff17c9abdf38cb3c0c9336fd4a354fbe40bb847 100644 --- a/umbra/saa_Romeo.py +++ b/umbra/saa_Romeo.py @@ -14,6 +14,8 @@ class SaaRomeo(AlignmentStrategy): self._mismatch = -2 self._gap_sc = -1 self._pointers = ['diag', 'up', 'left'] + self._source = None + self._shadow = None def align(self, source, shadow): """ This is the main function of finding alignments. @@ -29,28 +31,25 @@ class SaaRomeo(AlignmentStrategy): alignment_shadow: The found alignment for the shadow file """ discrete_start_time = time.time() - matrix = self._initialize_matrix(source, shadow) - matrix = self._fill_matrix(matrix, source, shadow) - alignment_source, alignment_shadow = self._traceback(matrix, - source, shadow) + self._source = source + self._shadow = shadow + matrix = self._initialize_matrix() + matrix = self._fill_matrix(matrix) + alignment_source, alignment_shadow = self._traceback(matrix) discrete_time = time.time() - discrete_start_time print(f'taken time:{discrete_time}') - return alignment_source, alignment_shadow + return self._source, self._shadow - def _initialize_matrix(self, source, shadow): + def _initialize_matrix(self): """The matrix is initialized according to the Needleman-Wunsch algorithm - Args: - source: Sentence of Word instances of the source file - shadow: Sentence of Word instances of the shadow file - Returns: matrix: A matrix containing the values and pointers, the latter indicating what shift in the matrix we take (up, up-left or left). The matrix that will be returned will have the default values """ - n = len(source) - m = len(shadow) + n = len(self._source) + m = len(self._shadow) matrix = np.array([[{'value': self._gap_sc*y, 'pointer': 'up'} if x == 0 else {'value': self._gap_sc*x, 'pointer': 'left'} @@ -60,26 +59,24 @@ class SaaRomeo(AlignmentStrategy): for y in range(m + 1)]) return matrix - def _fill_matrix(self, matrix, source, shadow): + def _fill_matrix(self, matrix): """The matrix is filled according to the Needleman-Wunsch algorithm Args: matrix: A matrix containing the values and pointers, the latter indicating what shift in the matrix we take (up, up-left or left). Now the matrix still has its default values - source: Sentence of Word instances of the source file - shadow: Sentence of Word instances of the shadow file Returns: matrix: A matrix containing the values and pointers, the latter indicating what shift in the matrix we take (up, up-left or left). Now the matrix's values and pointers are updated """ - n = len(source) - m = len(shadow) + n = len(self._source) + m = len(self._shadow) for i in range(1, m+1): for j in range(1, n+1): - if shadow[i-1].__eq__(source[j-1]): + if self._shadow[i-1].__eq__(self._source[j-1]): value = self._match else: value = self._mismatch @@ -92,55 +89,51 @@ class SaaRomeo(AlignmentStrategy): self._pointers[np.argmax([match_value, delete, insert])] return matrix - def _traceback(self, matrix, source, shadow): + def _traceback(self, matrix): """Traces back to top left to print the found alignment. Args: matrix: A matrix containing the values and pointers, the latter indicating what shift in the matrix we take (up, up-left or left) - source: Sentence of Word instances of the source file - shadow: Sentence of Word instances of the shadow file Returns: alignment_source: The alignment of source words alignment_shadow: The alignment of shadow words """ - j = len(source) - i = len(shadow) + j = len(self._source) + i = len(self._shadow) alignment_source = [] alignment_shadow = [] while i > 0 or j > 0: if matrix[i][j]['pointer'] == 'diag': - alignment_source.append(source[j - 1]) - alignment_shadow.append(shadow[i - 1]) + alignment_source.append(self._source[j - 1]) + alignment_shadow.append(self._shadow[i - 1]) + if self._source[j-1].__eq__(self._shadow[i-1]): + self._source[j-1].shadowed = True + self._shadow[i-1].source = self._source[j-1] i -= 1 j -= 1 elif matrix[i][j]['pointer'] == 'left': - alignment_source.append(source[j - 1]) + alignment_source.append(self._source[j - 1]) alignment_shadow.append(Gap()) j -= 1 elif matrix[i][j]['pointer'] == 'up': alignment_source.append(Gap()) - alignment_shadow.append(shadow[i - 1]) + alignment_shadow.append(self._shadow[i - 1]) i -= 1 #Finish tracing back to top-left while j > 0: - alignment_source.append(source[j - 1]) + alignment_source.append(self._source[j - 1]) alignment_shadow.append(Gap()) j -= 1 while i > 0: alignment_source.append(Gap()) - alignment_shadow.append(shadow[i - 1]) + alignment_shadow.append(self._shadow[i - 1]) i -= 1 alignment_source.reverse() alignment_shadow.reverse() - for source_word, shadow_word in zip(alignment_source, alignment_shadow): - if type(source_word) is not Gap and type(shadow_word) is not Gap: - if source_word == shadow_word: - source_word.shadowed = True - return Sentence(alignment_source), Sentence(alignment_shadow) diff --git a/umbra/statistics.py b/umbra/statistics.py index ae8ec93270e19633c8e35480622d506d37ac7103..b124b9f7bb516d14eaeb8e5564898bfb760392a5 100644 --- a/umbra/statistics.py +++ b/umbra/statistics.py @@ -31,29 +31,25 @@ class Statistics: print('Romeo') self._strategy = SaaRomeo() source_align, shadow_align = self._strategy.align(source, shadow) - print([str(x) for x in source_align]) - print([str(x) for x in shadow_align]) + for s_word in shadow_align: + if s_word.has_source(): + print(f'source: {s_word.source} shadow: {s_word}') correctness = self._strategy.correctly_shadowed(source) # Reset the is_shadowed property for word in source: word.shadowed = False - ## The other strategies have as of yet not been adapted - # #Alignment 1 - # print('\n Thijs') - # self._strategy = SaaAlgorithm(words_source, words_shadow) - # source_align, shadow_align = self._strategy.align() - # print([str(x) for x in source_align]) - # print([str(x) for x in shadow_align]) - # correctness = self._strategy.correctly_shadowed(words_source) - # + # The other strategies have as of yet not been adapted + # Alignment 1 (Thijs) + # Alignment 2 print('\n Emma') self._strategy = AnchorAlgorithm() - source_align, shadow_align = self._strategy.align(source, shadow) - print([str(x) for x in source_align]) - print([str(x) for x in shadow_align]) + source_align_em, shadow_align_em = self._strategy.align(source, shadow) + for s_word in shadow_align_em: + if s_word.has_source(): + print(f'source: {s_word.source} shadow: {s_word}') correctness = self._strategy.correctly_shadowed(source) return source_align, shadow_align, correctness diff --git a/umbra/words.py b/umbra/words.py index 2a9c2feb3e38c2fb2a8034f3168e9653760ede73..acbb63e91198f0f49fd60b6e68b32a8f58817d2f 100644 --- a/umbra/words.py +++ b/umbra/words.py @@ -106,7 +106,7 @@ class ShadowWord(Word): """Getter for Source attribute Return: - The source. + The source word. """ return self._source @@ -119,12 +119,18 @@ class ShadowWord(Word): """ self._source = source + def has_source(self): + """Check whether this word has a source word that it is matched with + + Returns: + source: True if this word is matched, False otherwise. + """ + return self._source is not None class SourceWord(Word): def __init__(self, word, onset, offset): super().__init__(word, onset, offset) self._shadowed = False - self._source = None @property def shadowed(self): @@ -144,32 +150,6 @@ class SourceWord(Word): """ self._shadowed = value - @property - def source(self): - """Getter for the source attribute - - Returns: - source: the word in the source file this word is aligned with - """ - return self._source - - @source.setter - def source(self, source): - """Set which word this one is aligned with - - Args: - source: the source word that this word is set to be aligned with - """ - self._source = source - - def get_difference(self, other): - """Get the difference between the onset of this word and the other. - - Args: - other: the other Word instance - """ - return other.onset - self._onset - class Sentence(list): def __init__(self, words):