import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from thefuzz import fuzz
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
from matplotlib import pyplot as plt
import networkx as nx
import gravis as gv
from sklearn.model_selection import train_test_split
import math

df = pd.read_csv('clustered_typos.csv',index_col=0)

df.head()

df.groupby('cluster').count().plot(kind='hist',title='Size of Clusters',xlabel='Clusters', ylabel='Size', legend=None)

<Axes: title={'center': 'Size of Clusters'}, xlabel='Clusters', ylabel='Size'>

df.groupby('cluster').count().mean()

word    6.622517
dtype: float64

df.groupby('cluster').count().sort_values(by='word')

unique_words = list(set(df['word']))

def normalize(text: set)->list:
    'transform every word in text to lower case'
    return [text[word].lower() if text[word] is not None else '' for word in range(len(text))]

text = normalize(unique_words)

def cos_sim(text: list)->np.array:
    'calculate the cosine similarity for every word in text'
    
    sep_list = [list(text[i]) for i in range(len(text))]
    corpus = [' '.join(sep_list[i]) for i in range(len(sep_list))]
    vect = CountVectorizer(tokenizer=lambda txt: txt.split(' '))
    X = vect.fit_transform(corpus)
    X = X.toarray()
    cos_array = cosine_similarity(X, X)

    #Set similarity to its own to 0
    for i in range(len(cos_array)):
        cos_array[i][i]=0
    
    return cos_array

cos_array = cos_sim(text)

def fuzz_ratio(text: list)->np.array:
    'calculate the fuzz/levenshtein similarity for every word in text'

    #Initiate empty cluster matrix
    mat_cluster = np.zeros([len(text),len(text)],dtype=np.int8)

    #Calculate fuzz similarity for every possible unique combination
    for idx1, w1 in enumerate(text):
        for idx2, w2 in enumerate(text):
            mat_cluster[idx1, idx2] = fuzz.ratio(w1, w2)

    #Scale ratio betwenn 0 and 1
    mat_cluster = mat_cluster / 100

    #Set similarity to its own to 0
    for i in range(len(mat_cluster)):
        mat_cluster[i][i] = 0

    return mat_cluster

fuzz_array = fuzz_ratio(text)

def sub_string(text: list, MIN_LEN_SUB=2)->np.array:
    'calculate the sub string similarity for every word in text'

    #Initiate empty cluster matrix
    mat_cluster = np.zeros([len(text),len(text)],dtype=np.int8)

    #Calculate sub string similarity for every possible unique combination
    for w1 in range(len(text)):
        if len(text[w1]) > MIN_LEN_SUB:
            for w2 in range(len(text)):
                if len(text[w2]) > MIN_LEN_SUB and w1 != w2:
                    if text[w1] in text[w2]:
                        mat_cluster[w1,w2] = 1
    
    #Mirror Similarity-Score in matrix
    for i in range(len(mat_cluster[:,0])):
        for j in range(len(mat_cluster[0,:])):
            mat_cluster[i,j] = mat_cluster[j,i] = max(mat_cluster[i,j],mat_cluster[j,i])
    
    return mat_cluster

sub_array = sub_string(text)

def jaccard(text:list)->np.array:
    'calculate the cosine similarity for every word in text'
    
    #Initiate empty cluster matrix
    mat_cluster_2 = np.zeros([len(text),len(text)])

    #Calculate jaccard similarity for every possible unique combination
    for idx1, w1 in enumerate(text):
        for idx2, w2 in enumerate(text[:idx1]):
            inter = set(text[idx1]).intersection(set(text[idx2]))
            un = set(text[idx1]).union(set(text[idx2]))
            mat_cluster_2[idx1, idx2] = len(inter) / len(un)

    #Mirror Similarity-Score in matrix
    for i in range(len(mat_cluster_2[:,0])):
        for j in range(len(mat_cluster_2[0,:])):
            mat_cluster_2[i,j] = mat_cluster_2[j,i] = max(mat_cluster_2[i,j],mat_cluster_2[j,i])
            
    return mat_cluster_2

jac_array = jaccard(text)

class name_clustering:
    
    def __init__(self):
        pass

    def matrix(self, matrix, weight_list):
        'build weighted combined matrix of all input matrices'
        
        if len(matrix.shape) == 2:
            assert len(weight_list) == 1

            #Initialize weighted matrix
            self.weighted_matrix = np.zeros_like(matrix)

            #Fill weighted matrix
            dimension = 0
            self.weighted_matrix += weight_list[dimension] * matrix[:,:]

        else:
            #Initialize weighted matrix
            self.weighted_matrix = np.zeros_like(matrix[0])

            #Fill weighted matrix
            for dimension in range(len(weight_list)):
                self.weighted_matrix += weight_list[dimension] * matrix[dimension, :, :]
        
        return self.weighted_matrix

    def create_adjacency_matrix_strict(self, matrix, THRESHOLD):
        'build adjacency matrix of weighted matrix'

        #build adjacency matrix
        num_words = len(matrix)
        self.adjacency_matrix = [[0]*num_words for _ in range(num_words)]

        def in_same_cluster(word1, word2):
            'Check if words are in the same cluster'
            return matrix[word1][word2] > THRESHOLD

        def check_cluster(cluster):
            'Check if all words in cluster have the needed similarity'
            for i in range(len(cluster)):
                for j in range(i + 1, len(cluster)):
                    if not in_same_cluster(cluster[i], cluster[j]):
                        return False
            return True

        #Iterate over each pair of words and update the adjacency matrix
        for i in range(num_words):
            for j in range(i + 1, num_words):
                if in_same_cluster(i, j):
                    self.adjacency_matrix[i][j] = 1
                    self.adjacency_matrix[j][i] = 1

        #Iterate over each pair of words outside the cluster and reset their connection
        for i in range(num_words):
            for j in range(num_words):
                if not in_same_cluster(i, j):
                    self.adjacency_matrix[i][j] = 0
                    self.adjacency_matrix[j][i] = 0

        #Iterate over each cluster and check the minimum similarity
        for i in range(num_words):
            cluster = [j for j in range(num_words) if self.adjacency_matrix[i][j] == 1]
            if not check_cluster(cluster):
                for j in cluster:
                    self.adjacency_matrix[i][j] = 0
                    self.adjacency_matrix[j][i] = 0
        
        return self.adjacency_matrix
        
    def create_adjacency_matrix(self, matrix, THRESHOLD):
        'build adjacency matrix of weighted matrix'

        self.adjacency_matrix = matrix.copy()
        self.adjacency_matrix[self.adjacency_matrix>THRESHOLD] = 1
        self.adjacency_matrix[self.adjacency_matrix<=THRESHOLD] = 0
                    
        return self.adjacency_matrix

def create_y(words: pd.Series, df: pd.DataFrame)->np.array:
    'create y as adjacency matrix'

    # Initialize empty array
    te = np.arange(len(words)*len(words))
    tes = np.reshape(te,[len(words), len(words)])
    y = np.zeros_like(tes)

    #Transform clusters to array
    for i in range(len(df)):
        cluster = df.loc[i]['cluster']
        neighbors = list(df[df['cluster']==cluster]['word'])
        neighbor_idxs = []
        for neighbor in neighbors:
            neighbor_idxs.append(words.index(neighbor))
            for start in neighbor_idxs:
                for goal in neighbor_idxs:
                    y[start][goal] = 1

    return y

def calculate_measures(y: np.array, adjacency_matrix: np.array)->tuple:
    'Calculate several measures'
    
    #Calculate Confusion Matrix
    diff_mat = y - adjacency_matrix
    for i in range(len(diff_mat)):
        diff_mat[i][i] = 0
    true_positives = (adjacency_matrix * y).sum()
    false_negatives = (diff_mat == 1).sum()
    false_positives = (diff_mat == -1).sum()

    #Calculate Precision and Recall
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    #Calculate F1-Score
    f1_score = 2 * ((precision * recall)/(precision + recall))

    #Calculate Fowlkes-Mallows Index
    fmi = math.sqrt(precision * recall)
    
    measure_dict = {'F1-Score': f1_score,
                    'FMI': fmi}

    return measure_dict

matr = np.array([cos_array, fuzz_array, sub_array, jac_array])
abc = name_clustering()
matrix = abc.matrix(matr, [0.25, 0.25, 0.25, 0.25])
adjacency_matrix = abc.create_adjacency_matrix(matrix, 0.5)

labels = {k: v for k, v in enumerate(unique_words)}

def draw_graph(G, labels):
    'draw a graph with specified labels and Parameters'
    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, node_size=10)
    nx.draw_networkx_labels(G, pos, labels, font_size=8)
    nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.2)

G = nx.from_numpy_array(adjacency_matrix*matrix)
plt.figure(1, figsize=(20,20))
draw_graph(G, labels)
plt.show()

for idx, word in enumerate(unique_words):
    G.nodes[idx]['title'] = word
fig_first_model = gv.three(G, node_label_data_source='title')
fig_first_model

y = create_y(unique_words, df)
print(f'The FMI-Score of the first model is {calculate_measures(y, adjacency_matrix)['FMI']}')

The FMI-Score of the first model is 0.8542649104429728

class individual():

    def __init__(self, num_weights):
        self.thresh = np.random.randint(0,100)/100
        self.weights = []
        for i in range(num_weights):
            self.weights.append(np.random.randint(0,100))
        for i in range(num_weights):
            self.weights[i] /= sum(self.weights)
            self.weights[i] = round(self.weights[i],2)
        return
    
    def fitness(self, matr, y)->dict:
        'Calculate fitness of individuum'

        #Create adjacency matrix
        abc = name_clustering()
        matrix = abc.matrix(matr, self.weights)
        adjacency_matrix = abc.create_adjacency_matrix(matrix, self.thresh)

        #Calculate FMI-Score
        measures = calculate_measures(y, adjacency_matrix)
        if math.isnan(measures['FMI']):
            measures['FMI'] = 0
        
        return measures['FMI']
    
    def mutation_weight(self)->None:
        'Mutate the weights of the individuum'

        #Get random gene
        gene = np.random.randint(0,len(self.weights))

        #Mutate allel
        self.weights[gene] = np.random.random()

        #Normalize genes
        for i in range(len(self.weights)):
            self.weights[i] /= sum(self.weights)
            self.weights[i] = round(self.weights[i],2)
        
        return
    
    def mutation_treshold(self)->None:
        'Mutate threshold of individuum'

        self.thresh = np.random.randint(0,100)/100

        return
    
    def repair_weights(self)->None:
        'Repair weights of individuum that sum of weights = 1'

        for i in range(len(self.weights)):
            self.weights[i] /= sum(self.weights)
            self.weights[i] = round(self.weights[i],2)
        
        return

def create_population(pop_size: int, num_weights: int)->list:
    'Create initial population'

    population = []
    for i in range(pop_size):
        population.append(individual(num_weights))
    
    return population

def calculate_fitness(population: list, matr: np.array, y: np.array)->list[float]:
    'Calculate fitness for whole population'

    fitness = []
    for i in range(len(population)):
        fitness.append(population[i].fitness(matr, y))
    
    return fitness

def mutation(fitness: list, mut_prob: float, population: list)->None:
    'Mutate whole population'

    #Repeat mutation as indicated by mutation probability (`mut_prob`)
    for _ in range(int(len(population) * mut_prob)):

        #Get random individuum for mutation
        weight_random = np.random.randint(0,len(population))
        thresh_random = np.random.randint(0,len(population))

        #Mutate weight of individuum
        population[weight_random].mutation_weight()
        fitness[weight_random] = population[weight_random].fitness(matr, y)

        #Mutate threshold of individuum
        population[thresh_random].mutation_treshold()
        fitness[thresh_random] = population[thresh_random].fitness(matr, y)
    
    return

def roullete_wheel_build(population: list, pop_fit: list[float])->list[float]:
    'Build roullete wheel for selection process'

    cum_fit = []
    sum_fit = 0
    for i in range(len(pop_fit)):
        sum_fit += pop_fit[i] / sum(pop_fit)
        cum_fit.append(sum_fit)
    
    return cum_fit

def select(population: list, pop_fit: list[float]):
    'Select 2 individuals for recombination'

    lst = roullete_wheel_build(population, pop_fit)
    parent_1 = lst.index(next(x for x in lst if x > 0.5)) - 1
    parent_2 = lst.index(next(x for x in lst if x > 0.5)) - 1

    return parent_1, parent_2

def recombination(num_weights: int, parent_1: int, parent_2: int, pop_fit: list, matr: np.array, y: np.array, population: list, kid_population: list)->None:
    'Recombine two individuals into four children'

    #Initialize children
    kid_1 = individual(num_weights)
    kid_2 = individual(num_weights)
    kid_3 = individual(num_weights)
    kid_4 = individual(num_weights)

    #Define split of genes
    split = np.random.randint(0,len(population[0].weights))

    #Create first child, repair weights, add to population and calculate fitness
    kid_1.weights = population[parent_1].weights[:split] + population[parent_2].weights[split:]
    kid_1.repair_weights()
    kid_1.thresh = population[parent_1].thresh
    kid_population.append(kid_1)
    pop_fit.append(population[-1].fitness(matr, y))

    #Create second child, repair weights, add to population and calculate fitness
    kid_2.weights = population[parent_2].weights[:split] + population[parent_1].weights[split:]
    kid_2.repair_weights()
    kid_2.thresh = population[parent_1].thresh
    kid_population.append(kid_2)
    pop_fit.append(population[-1].fitness(matr, y))

    #Create third child, repair weights, add to population and calculate fitness
    kid_3.weights = population[parent_1].weights[:split] + population[parent_2].weights[split:]
    kid_3.repair_weights()
    kid_3.thresh = population[parent_2].thresh
    kid_population.append(kid_1)
    pop_fit.append(population[-1].fitness(matr, y))

    #Create fourth child, repair weights, add to population and calculate fitness
    kid_4.weights = population[parent_2].weights[:split] + population[parent_1].weights[split:]
    kid_4.repair_weights()
    kid_4.thresh = population[parent_2].thresh
    kid_population.append(kid_2)
    pop_fit.append(population[-1].fitness(matr, y))

    return

#Hyperparameters of Evolutionary Algorithm
pop_size = 15
num_weights = 4
generations = 30
recomb_rate = 10

#Initiate population
population = create_population(pop_size, num_weights)

#Create base for fitness calculation
matr = np.array([cos_array, fuzz_array, sub_array, jac_array])
y = create_y(unique_words, df)

best_fit = []
best_weight = []
best_thresh = []

for generation in range(generations):

    #Calculate Fitness
    pop_fit = calculate_fitness(population, matr, y)

    #Recombine Individuals
    kid_population = []
    kid_fit = []
    for i in range(recomb_rate):
        parent_1, parent_2 = select(population, pop_fit)
        recombination(4, parent_1, parent_2, kid_fit, matr, y, population, kid_population)
    
    #Mutate childs
    mutation(kid_fit, 0.3, kid_population)

    #Aggregate both populations
    population += kid_population
    pop_fit += kid_fit

    #Reduce population
    best_ind_idx = list(pd.DataFrame(pop_fit, columns=['fitness']).sort_values(by='fitness', ascending=False).head(pop_size).index)
    best_fit.append(pop_fit[best_ind_idx[0]])
    new_population = [population[idx] for idx in best_ind_idx]
    population = new_population
    best_weight.append(population[0].weights)
    best_thresh.append(population[0].thresh)

    print('Generation: ' + str(generation) + ', Best Fitness: ' + str(best_fit[-1]) + ', Threshold: ' + str(population[0].thresh) + ', Weights: ' + str(population[0].weights))

Generation: 0, Best Fitness: 0.7323346254241884, Threshold: 0.88, Weights: [0.11, 0.46, 0.4, 0.97]
Generation: 1, Best Fitness: 0.8558526958848512, Threshold: 0.04, Weights: [0.04, 0.23, 0.2, 0.68]
Generation: 2, Best Fitness: 0.8886639463026328, Threshold: 0.69, Weights: [0.02, 0.5, 0.16, 0.46]
Generation: 3, Best Fitness: 0.8886639463026328, Threshold: 0.69, Weights: [0.02, 0.5, 0.16, 0.46]
Generation: 4, Best Fitness: 0.8886639463026328, Threshold: 0.69, Weights: [0.02, 0.5, 0.16, 0.46]
Generation: 5, Best Fitness: 0.8886639463026328, Threshold: 0.69, Weights: [0.02, 0.5, 0.16, 0.46]
Generation: 6, Best Fitness: 0.8886639463026328, Threshold: 0.69, Weights: [0.02, 0.5, 0.16, 0.46]
Generation: 7, Best Fitness: 0.8890724569088491, Threshold: 0.69, Weights: [0.17, 0.36, 0.14, 0.42]
Generation: 8, Best Fitness: 0.8890724569088491, Threshold: 0.69, Weights: [0.17, 0.36, 0.14, 0.42]
Generation: 9, Best Fitness: 0.8890724569088491, Threshold: 0.69, Weights: [0.17, 0.36, 0.14, 0.42]
Generation: 10, Best Fitness: 0.8890724569088491, Threshold: 0.69, Weights: [0.17, 0.36, 0.14, 0.42]
Generation: 11, Best Fitness: 0.8890724569088491, Threshold: 0.69, Weights: [0.17, 0.36, 0.14, 0.42]
Generation: 12, Best Fitness: 0.8890724569088491, Threshold: 0.69, Weights: [0.17, 0.36, 0.14, 0.42]
Generation: 13, Best Fitness: 0.8890724569088491, Threshold: 0.69, Weights: [0.17, 0.36, 0.14, 0.42]
Generation: 14, Best Fitness: 0.8890724569088491, Threshold: 0.69, Weights: [0.17, 0.36, 0.14, 0.42]
Generation: 15, Best Fitness: 0.8890724569088491, Threshold: 0.69, Weights: [0.17, 0.36, 0.14, 0.42]
Generation: 16, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 17, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 18, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 19, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 20, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 21, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 22, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 23, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 24, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 25, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 26, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 27, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 28, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]
Generation: 29, Best Fitness: 0.8997855075417331, Threshold: 0.51, Weights: [0.01, 0.56, 0.23, 0.13]

x_scale = [x for x in range(1, generations + 1)]

plt.figure(figsize=[20,4])

plt.subplot(131)
plt.plot(x_scale, best_fit)
plt.xlabel('Generation')
plt.ylabel('FMI-Score')
plt.title('FMI-Score over Generations')
plt.annotate('Best FMI-Score: ' + str(round(best_fit[-1],3)), [generations,best_fit[-1]], xytext=[generations/2, best_fit[-1]-0.03], arrowprops=dict(facecolor='black', shrink=0.05))

plt.subplot(132)
plt.plot(x_scale, best_thresh)
plt.xlabel('Generation')
plt.ylabel('Threshold')
plt.title('Threshold over Generations')
plt.annotate('Best Threshold: ' + str(round(best_thresh[-1],3)), [generations,best_thresh[-1]], xytext=[generations/2, 0.5], arrowprops=dict(facecolor='black', shrink=0.05))

plt.subplot(133)
weight_cos = [ind[0] for ind in best_weight]
weight_fuzz = [ind[1] for ind in best_weight]
weight_sub = [ind[2] for ind in best_weight]
weight_jac = [ind[3] for ind in best_weight]
plt.plot(x_scale, weight_cos,
         x_scale, weight_fuzz,
         x_scale, weight_sub,
         x_scale, weight_jac)
plt.xlabel('Generation')
plt.ylabel('Weight')
plt.title('Weights over Generations')
plt.legend(['Cosine (' + str(round(best_weight[-1][0],2)) + ')', 'Fuzz (' + str(round(best_weight[-1][1],2)) + ')', 'Sub (' + str(round(best_weight[-1][2],2)) + ')', 'Jaccard (' + str(round(best_weight[-1][3],2)) + ')'])

plt.show

<function matplotlib.pyplot.show(close=None, block=None)>

abc = name_clustering()
matrix = abc.matrix(matr, best_weight[-1])
adjacency_matrix = abc.create_adjacency_matrix(matrix, best_thresh[-1])

labels = {k: v for k, v in enumerate(unique_words)}

def draw_graph(G, labels):
    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, node_size=10)
    nx.draw_networkx_labels(G, pos, labels, font_size=8)
    nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.2)

G = nx.from_numpy_array(adjacency_matrix*matrix)
plt.figure(1, figsize=(20,20))
draw_graph(G, labels)
plt.show()

for idx, word in enumerate(unique_words):
    G.nodes[idx]['title'] = word
fig_opt_model = gv.three(G, node_label_data_source='title')
fig_opt_model

X = df['word']
y = df['cluster']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42)

train = pd.DataFrame(zip(X_train, y_train))
train.columns = ['word','cluster']

unique_words = list(set(train['word']))

text = normalize(unique_words)

cos_array = cos_sim(text)
fuzz_array = fuzz_ratio(text)
sub_array = sub_string(text)
jac_array = jaccard(text)

#Hyperparameters of Evolutionary Algorithm
pop_size = 15
num_weights = 4
generations = 30
recomb_rate = 10

#Initiate population
population = create_population(pop_size, num_weights)

#Create base for fitness calculation
matr = np.array([cos_array, fuzz_array, sub_array, jac_array])
y = create_y(unique_words, train)

best_fit = []
best_weight = []
best_thresh = []

for generation in range(generations):

    #Calculate Fitness
    pop_fit = calculate_fitness(population, matr, y)

    #Recombine Individuals
    kid_population = []
    kid_fit = []
    for i in range(recomb_rate):
        parent_1, parent_2 = select(population, pop_fit)
        recombination(4, parent_1, parent_2, kid_fit, matr, y, population, kid_population)
    
    #Mutate childs
    mutation(kid_fit, 0.3, kid_population)

    #Aggregate both populations
    population += kid_population
    pop_fit += kid_fit

    #Reduce population
    best_ind_idx = list(pd.DataFrame(pop_fit, columns=['fitness']).sort_values(by='fitness', ascending=False).head(pop_size).index)
    best_fit.append(pop_fit[best_ind_idx[0]])
    new_population = [population[idx] for idx in best_ind_idx]
    population = new_population
    best_weight.append(population[0].weights)
    best_thresh.append(population[0].thresh)

    print('Generation: ' + str(generation) + ', Best Fitness: ' + str(best_fit[-1]) + ', Threshold: ' + str(population[0].thresh) + ', Weights: ' + str(population[0].weights))

Generation: 0, Best Fitness: 0.8339075952110049, Threshold: 0.99, Weights: [0.1, 0.49, 0.06, 0.99]
Generation: 1, Best Fitness: 0.8959552565486787, Threshold: 0.27, Weights: [0.11, 0.2, 0.51, 0.1]
Generation: 2, Best Fitness: 0.8843984767376136, Threshold: 0.27, Weights: [0.13, 0.22, 0.55, 0.05]
Generation: 3, Best Fitness: 0.9005997027653069, Threshold: 0.61, Weights: [0.11, 0.51, 0.05, 0.22]
Generation: 4, Best Fitness: 0.9005997027653069, Threshold: 0.61, Weights: [0.11, 0.51, 0.05, 0.22]
Generation: 5, Best Fitness: 0.9005997027653069, Threshold: 0.61, Weights: [0.11, 0.51, 0.05, 0.22]
Generation: 6, Best Fitness: 0.9005997027653069, Threshold: 0.61, Weights: [0.11, 0.51, 0.05, 0.22]
Generation: 7, Best Fitness: 0.9047353466191334, Threshold: 0.31, Weights: [0.06, 0.38, 0.46, 0.09]
Generation: 8, Best Fitness: 0.902555070369506, Threshold: 0.38, Weights: [0.03, 0.4, 0.47, 0.09]
Generation: 9, Best Fitness: 0.902555070369506, Threshold: 0.38, Weights: [0.03, 0.4, 0.47, 0.09]
Generation: 10, Best Fitness: 0.902555070369506, Threshold: 0.38, Weights: [0.03, 0.4, 0.47, 0.09]
Generation: 11, Best Fitness: 0.902555070369506, Threshold: 0.38, Weights: [0.03, 0.4, 0.47, 0.09]
Generation: 12, Best Fitness: 0.902555070369506, Threshold: 0.38, Weights: [0.03, 0.4, 0.47, 0.09]
Generation: 13, Best Fitness: 0.902555070369506, Threshold: 0.38, Weights: [0.03, 0.4, 0.47, 0.09]
Generation: 14, Best Fitness: 0.902555070369506, Threshold: 0.38, Weights: [0.03, 0.4, 0.47, 0.09]
Generation: 15, Best Fitness: 0.902555070369506, Threshold: 0.38, Weights: [0.03, 0.4, 0.47, 0.09]
Generation: 16, Best Fitness: 0.902555070369506, Threshold: 0.38, Weights: [0.03, 0.4, 0.47, 0.09]
Generation: 17, Best Fitness: 0.902555070369506, Threshold: 0.38, Weights: [0.03, 0.4, 0.47, 0.09]
Generation: 18, Best Fitness: 0.902555070369506, Threshold: 0.38, Weights: [0.03, 0.4, 0.47, 0.09]
Generation: 19, Best Fitness: 0.902555070369506, Threshold: 0.38, Weights: [0.03, 0.4, 0.47, 0.09]
Generation: 20, Best Fitness: 0.9029708796917256, Threshold: 0.28, Weights: [0.06, 0.24, 0.61, 0.09]
Generation: 21, Best Fitness: 0.9029708796917256, Threshold: 0.28, Weights: [0.06, 0.24, 0.61, 0.09]
Generation: 22, Best Fitness: 0.9029708796917256, Threshold: 0.28, Weights: [0.06, 0.24, 0.61, 0.09]
Generation: 23, Best Fitness: 0.9029708796917256, Threshold: 0.28, Weights: [0.06, 0.24, 0.61, 0.09]
Generation: 24, Best Fitness: 0.9029708796917256, Threshold: 0.28, Weights: [0.06, 0.24, 0.61, 0.09]
Generation: 25, Best Fitness: 0.9029708796917256, Threshold: 0.28, Weights: [0.06, 0.24, 0.61, 0.09]
Generation: 26, Best Fitness: 0.9029708796917256, Threshold: 0.28, Weights: [0.06, 0.24, 0.61, 0.09]
Generation: 27, Best Fitness: 0.9029708796917256, Threshold: 0.28, Weights: [0.06, 0.24, 0.61, 0.09]
Generation: 28, Best Fitness: 0.9029708796917256, Threshold: 0.28, Weights: [0.06, 0.24, 0.61, 0.09]
Generation: 29, Best Fitness: 0.9029708796917256, Threshold: 0.28, Weights: [0.06, 0.24, 0.61, 0.09]

x_scale = [x for x in range(1, generations + 1)]

plt.figure(figsize=[20,4])

plt.subplot(131)
plt.plot(x_scale, best_fit)
plt.xlabel('Generation')
plt.ylabel('FMI-Score')
plt.title('FMI-Score over Generations')
plt.annotate('Best FMI-Score: ' + str(round(best_fit[-1],3)), [generations,best_fit[-1]], xytext=[generations/2, best_fit[-1]-0.02], arrowprops=dict(facecolor='black', shrink=0.05))

plt.subplot(132)
plt.plot(x_scale, best_thresh)
plt.xlabel('Generation')
plt.ylabel('Threshold')
plt.title('Threshold over Generations')
plt.annotate('Best Threshold: ' + str(round(best_thresh[-1],3)), [generations,best_thresh[-1]], xytext=[generations/2, 0.6], arrowprops=dict(facecolor='black', shrink=0.05))

plt.subplot(133)
weight_cos = [ind[0] for ind in best_weight]
weight_fuzz = [ind[1] for ind in best_weight]
weight_sub = [ind[2] for ind in best_weight]
weight_jac = [ind[3] for ind in best_weight]
plt.plot(x_scale, weight_cos,
         x_scale, weight_fuzz,
         x_scale, weight_sub,
         x_scale, weight_jac)
plt.xlabel('Generation')
plt.ylabel('Weight')
plt.title('Weights over Generations')
plt.legend(['Cosine (' + str(round(best_weight[-1][0],2)) + ')', 'Fuzz (' + str(round(best_weight[-1][1],2)) + ')', 'Sub (' + str(round(best_weight[-1][2],2)) + ')', 'Jaccard (' + str(round(best_weight[-1][3],2)) + ')'])

plt.show

<function matplotlib.pyplot.show(close=None, block=None)>

test = pd.DataFrame(zip(X_test, y_test))
test.columns = ['word','cluster']

unique_words = list(set(test['word']))

text = normalize(unique_words)

cos_array = cos_sim(text)
fuzz_array = fuzz_ratio(text)
sub_array = sub_string(text)
jac_array = jaccard(text)

matr = np.array([cos_array, fuzz_array, sub_array, jac_array])
abc = name_clustering()
matrix = abc.matrix(matr, best_weight[-1])
adjacency_matrix = abc.create_adjacency_matrix(matrix, best_thresh[-1])

labels = {k: v for k, v in enumerate(unique_words)}

def draw_graph(G, labels):
    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, node_size=10)
    nx.draw_networkx_labels(G, pos, labels, font_size=8)
    nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.2)

G = nx.from_numpy_array(adjacency_matrix*matrix)
plt.figure(1, figsize=(20,20))
draw_graph(G, labels)
plt.show()

for idx, word in enumerate(unique_words):
    G.nodes[idx]['title'] = word
fig_test = gv.three(G, node_label_data_source='title')
fig_test

y = create_y(unique_words, test)
print(f'The FMI-Score of the test model is {calculate_measures(y, adjacency_matrix)['FMI']}')

The FMI-Score of the test model is 0.891998276885861

	word	cluster
0	Chasney	Chansey
1	Nidqueen	Nidoqueen
2	GAstly	Gastly
3	Vulpix	Vulpix
4	Gasly	Gastly

	word
cluster
Nidoqueen	1
Nidoranf	1
Weezing	2
Tauros	2
Seaking	2
...	...
Weepinbell	13
Horsea	13
Caterpie	14
Clefairy	14
Cloyster	14

1. Libraries¶

2. Read Data

3. Name Clustering¶

3.1 Similarity Metrics¶

3.1.1 Cosine Similarity¶

3.1.2 Fuzz Ratio¶

3.1.3 Sub String Clustering¶

3.1.4 Jaccard Similarity¶

3.2 Define Cluster-Class and Functions¶

4. First Model¶

5. Hyperparameter Optimization¶

5.1 Initialize Evolutionary Algorithm¶

5.2 Apply Evolutionary Algorithm¶

5.3 Visualize Optimization¶

5.4 Visualization of Clusters with optimised Parameters¶

6. Test with Train and Test split¶

6.1 Get X and y¶

6.2 Training¶

6.3 Test¶