Sc 76

Merged Nijsen, T.J.P. (Thomas) requested to merge SC-76 into master
1 file changed, +28 −16
@@ -6,7 +6,7 @@
import numpy as np
# Input sequences
-S1 = ["This", "is", "an", "example", "sequence", "This", "is" "the" "original"]
+S1 = ["This", "is", "an", "example", "sequence", "This", "is", "the", "original"]
S2 = ["This", "is", "another", "example", "sequence", "This", "is", "not", "the", "original"]
# Scoring system
@@ -40,40 +40,52 @@ def word_to_vector(word, dictionary, scores):
def word_similarity(word1, word2, avr_latency):
""""Takes in 2 words as vectors and calculates the cosine similarity measure.
""""Takes in 2 words as vectors and calculates the similarity measure.
Currently based on a distance measure instead of cosine. Cosine produced
a lot of NaNs and required too much normalization.
Uses latency scores as well. In this case as a correcting term.
Concept can be extended this measure and refined for latencies. """
val = np.dot(word1, word2) / (np.linalg.norm(w1) * np.linalg.norm(w2))
theta = np.arccos(1/val)
# Correcting term for latency can be made non-linear as not punish too strongly
return theta * (0.00001 + avr_latency) + avr_latency
# Use normalization by latency as an example of how it could be implemented.
dist = np.linalg.norm(word2 - word1) / avr_latency
# Correcting term for latency can be made non-linear as not punish too strongly,
# square root is a good example.
return dist + np.sqrt(avr_latency)
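# A minimal sketch, not part of this merge request: the docstring above notes that a
# cosine-based measure produced NaNs and required heavy normalization. Assuming the same
# numpy word vectors, a guarded version might look like the following
# (word_similarity_cosine is a hypothetical name, not used elsewhere in the file):
def word_similarity_cosine(word1, word2, avr_latency):
    denom = np.linalg.norm(word1) * np.linalg.norm(word2)
    if denom == 0:
        # A zero vector has no defined angle; fall back to the latency term alone.
        return np.sqrt(avr_latency)
    # Clip so floating-point error cannot push the ratio outside arccos's domain (the NaN source).
    cos_sim = np.clip(np.dot(word1, word2) / denom, -1.0, 1.0)
    theta = np.arccos(cos_sim)  # angle in [0, pi]; 0 means identical direction
    return theta + np.sqrt(avr_latency)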
def sequence_alignment(seq1, seq2, avr_latency, dictionary, scores):
-    align_score = 0
+    """Return all alignment values between words based on similarity."""
+    align_distribution = []
    # Convert all to vectors
-    for sw1, sw2 in zip(seq1, seq2):
-        v1, v2 = word_to_vector(sw1, dictionary, scores), word_to_vector(sw2, dictionary, scores)
+    vseq1 = list(map(lambda x: word_to_vector(x, dictionary, scores), seq1))
+    vseq2 = list(map(lambda x: word_to_vector(x, dictionary, scores), seq2))
+    # Calculate similarity scores
+    for word1, v1 in zip(seq1, vseq1):
+        for word2, v2 in zip(seq2, vseq2):
+            align_distribution.append(word1 + " - " + word2 + " : " + str(word_similarity(v1, v2, avr_latency)))
-    return align_score
+    return align_distribution
# Words test 1
w1 = word_to_vector("This", letters, inverses)
w2 = word_to_vector("This", letters, inverses)
-sim_score1 = word_similarity(w1, w2, 10 * 10 ** (-3))
+sim_score1 = word_similarity(w1, w2, 10 * 10 ** (-3))  # Result: low score, almost 0. Expected property.
w3 = word_to_vector("Thos", letters, inverses)
-sim_score2 = word_similarity(w1, w3, 10 * 10 ** (-3))
+sim_score2 = word_similarity(w1, w3, 10 * 10 ** (-3))  # Result: high score, maybe a bit too harsh now.
w4 = word_to_vector("This", letters, inverses)
-sim_score3 = word_similarity(w1, w4, 170 * 10 ** (-3))
-print(sim_score1, sim_score2, sim_score3)
+sim_score3 = word_similarity(w1, w4, 170 * 10 ** (-3))  # Result: higher than test 1 & lower than test 2, seems a good compromise.
+print("Words test 1: ", sim_score1, sim_score2, sim_score3)
+# Sequence test 1
+print("Sequence test 1: ", sequence_alignment(S1, S2, 10 * 10 ** (-3), letters, inverses))
+""" High variability in results, but it still seems to have some nice properties. With a bit of refinement it
+could probably be made more reliable and the scores more sensible. Overall, the current measure is simple and seems to
+line up with intuition. It can also be easily extended to multivariate functions or kernels (especially useful for
+high- or even infinite-dimensional data). """
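
The closing note suggests extending the measure to multivariate functions or kernels. As a hedged sketch of that direction (not part of this merge request), the snippet below swaps the Euclidean distance for an RBF (Gaussian) kernel on the same word vectors; the name word_similarity_rbf and the bandwidth parameter gamma are illustrative assumptions.

import numpy as np

def word_similarity_rbf(word1, word2, avr_latency, gamma=1.0):
    """Kernel-based variant: RBF similarity turned into a distance-like score."""
    sq_dist = np.sum((word2 - word1) ** 2)
    kernel_val = np.exp(-gamma * sq_dist)  # 1.0 for identical vectors, -> 0 as they diverge
    # Keep the same square-root latency correction used in word_similarity.
    return (1.0 - kernel_val) + np.sqrt(avr_latency)

Because the kernel depends on the vectors only through a distance, the same idea carries over to richer, higher-dimensional feature maps without changing the calling code.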