Sc 76

Merged Nijsen, T.J.P. (Thomas) requested to merge SC-76 into master
1 file changed, +28 −16
@@ -6,7 +6,7 @@
import numpy as np
# Input sequences
-S1 = ["This", "is", "an", "example", "sequence", "This", "is" "the" "original"]
+S1 = ["This", "is", "an", "example", "sequence", "This", "is", "the", "original"]
S2 = ["This", "is", "another", "example", "sequence", "This", "is", "not", "the", "original"]
# Scoring system
@@ -40,40 +40,52 @@ def word_to_vector(word, dictionary, scores):
def word_similarity(word1, word2, avr_latency):
""""Takes in 2 words as vectors and calculates the cosine similarity measure.
""""Takes in 2 words as vectors and calculates the similarity measure.
Currently based on a distance measure instead of cosine. Cosine produced
a lot of NaNs and required too much normalization.
Uses latency scores as well. In this case as a correcting term.
Concept can be extended this measure and refined for latencies. """
val = np.dot(word1, word2) / (np.linalg.norm(w1) * np.linalg.norm(w2))
theta = np.arccos(1/val)
# Correcting term for latency can be made non-linear as not punish too strongly
return theta * (0.00001 + avr_latency) + avr_latency
# Use normalization by latency as an example of how it could be implemented.
dist = np.linalg.norm(word2 - word1) / avr_latency
# Correcting term for latency can be made non-linear as not punish too strongly,
# square root is a good example.
return dist + np.sqrt(avr_latency)
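# A minimal sketch, not part of this merge request: the docstring above notes that a
# cosine-based measure produced NaNs and required heavy normalization. Assuming the same
# numpy word vectors, a guarded version might look like the following
# (word_similarity_cosine is a hypothetical name, not used elsewhere in the file):
def word_similarity_cosine(word1, word2, avr_latency):
    denom = np.linalg.norm(word1) * np.linalg.norm(word2)
    if denom == 0:
        # A zero vector has no defined angle; fall back to the latency term alone.
        return np.sqrt(avr_latency)
    # Clip so floating-point error cannot push the ratio outside arccos's domain (the NaN source).
    cos_sim = np.clip(np.dot(word1, word2) / denom, -1.0, 1.0)
    theta = np.arccos(cos_sim)  # angle in [0, pi]; 0 means identical direction
    return theta + np.sqrt(avr_latency)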
def sequence_alignment(seq1, seq2, avr_latency, dictionary, scores):
-    align_score = 0
+    """Return all alignment values between words based on similarity."""
+    align_distribution = []
    # Convert all to vectors
-    for sw1, sw2 in zip(seq1, seq2):
-        v1, v2 = word_to_vector(sw1, dictionary, scores), word_to_vector(sw2, dictionary, scores)
+    vseq1 = list(map(lambda x: word_to_vector(x, dictionary, scores), seq1))
+    vseq2 = list(map(lambda x: word_to_vector(x, dictionary, scores), seq2))
+    # Calculate similarity scores
+    for word1, v1 in zip(seq1, vseq1):
+        for word2, v2 in zip(seq2, vseq2):
+            align_distribution.append(word1 + " - " + word2 + " : " + str(word_similarity(v1, v2, avr_latency)))
-    return align_score
+    return align_distribution
# Words test 1
w1 = word_to_vector("This", letters, inverses)
w2 = word_to_vector("This", letters, inverses)
-sim_score1 = word_similarity(w1, w2, 10 * 10 ** (-3))
+sim_score1 = word_similarity(w1, w2, 10 * 10 ** (-3))  # Result: low score, almost 0. Expected property.
w3 = word_to_vector("Thos", letters, inverses)
-sim_score2 = word_similarity(w1, w3, 10 * 10 ** (-3))
+sim_score2 = word_similarity(w1, w3, 10 * 10 ** (-3))  # Result: high score, maybe a bit too harsh now.
w4 = word_to_vector("This", letters, inverses)
-sim_score3 = word_similarity(w1, w4, 170 * 10 ** (-3))
-print(sim_score1, sim_score2, sim_score3)
+sim_score3 = word_similarity(w1, w4, 170 * 10 ** (-3))  # Result: higher than test 1 & lower than test 2, seems a good compromise.
+print("Words test 1: ", sim_score1, sim_score2, sim_score3)
+# Sequence test 1
+print("Sequence test 1: ", sequence_alignment(S1, S2, 10 * 10 ** (-3), letters, inverses))
+""" High variability in results, but it still seems to have some nice properties. With a bit of refinement it
+could probably be made more reliable and the scores more sensible. Overall, the current measure is simple and seems to
+line up with intuition. It can also be easily extended to multivariate functions or kernels (especially useful for
+high- or even infinite-dimensional data). """
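
The closing note suggests extending the measure to multivariate functions or kernels. As a hedged sketch of that direction (not part of this merge request), the snippet below swaps the Euclidean distance for an RBF (Gaussian) kernel on the same word vectors; the name word_similarity_rbf and the bandwidth parameter gamma are illustrative assumptions.

import numpy as np

def word_similarity_rbf(word1, word2, avr_latency, gamma=1.0):
    """Kernel-based variant: RBF similarity turned into a distance-like score."""
    sq_dist = np.sum((word2 - word1) ** 2)
    kernel_val = np.exp(-gamma * sq_dist)  # 1.0 for identical vectors, -> 0 as they diverge
    # Keep the same square-root latency correction used in word_similarity.
    return (1.0 - kernel_val) + np.sqrt(avr_latency)

Because the kernel depends on the vectors only through a distance, the same idea carries over to richer, higher-dimensional feature maps without changing the calling code.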