Project author: billsioros

Project description:
Twitter Sentiment Analysis
Primary language: Jupyter Notebook
Repository: git://github.com/billsioros/twitter-sentiment-analysis.git
Created: 2019-03-30T15:40:30Z
Project community: https://github.com/billsioros/twitter-sentiment-analysis

License: MIT License


Contributors

  • Sioros Vasileios (billsioros)
  • Konstantinos Kyriakos (Qwerkykk)

Notice

We are providing only a sample of the original datasets.


Firstly, we define some utility functions and globally declare our train set and test set filenames.

```python
import os

# Train and test set filenames, declared globally
filenames = ['..\\data\\train2017.tsv', '..\\data\\test2017.tsv']

def platform_path(path):
    return os.path.abspath(os.path.expanduser(path))

def platform_filename(filenames, tags):
    if not isinstance(filenames, list) or not isinstance(tags, list):
        raise ValueError("Both arguments must be instances of the 'list' object")

    filenames = sorted(set(map(platform_path, filenames)))

    filepath = []
    for filename in filenames:
        filepath.append(os.path.splitext(os.path.basename(filename))[0])

    filepath = '_'.join(filepath + tags)

    return os.path.join(os.path.curdir, 'out', filepath)
```
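
As a quick illustration, this is roughly what `platform_filename` produces for the filenames declared above; the exact output in the comment is indicative and assumes a Windows working directory:

```python
# Indicative usage of the helpers defined above
print(platform_filename(filenames, ['preprocessed']))
# e.g. '.\\out\\test2017_train2017_preprocessed' (sorted basenames joined with the supplied tags)
```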

Let us implement our so-called ‘cruncher’. It is an abstraction layer on top of the lemmatizer and the stemmer that the ‘nltk’ package provides, and its API is quite similar to theirs.

```python
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

class Cruncher:
    def __init__(self, method='lemmatizer'):
        self.method = method

        if method == 'lemmatizer':
            self.underlying = WordNetLemmatizer()
            self.crunch = self.underlying.lemmatize
        elif method == 'stemmer':
            self.underlying = PorterStemmer()
            self.crunch = self.underlying.stem
        else:
            raise ValueError("'" + method + "' is not supported")
```
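
A minimal usage sketch of the Cruncher abstraction, assuming the relevant ‘nltk’ corpora (e.g. ‘wordnet’) have already been downloaded:

```python
# Hypothetical usage of the Cruncher abstraction
lemmatizer = Cruncher()                  # defaults to 'lemmatizer'
stemmer = Cruncher(method='stemmer')

print(lemmatizer.crunch('tweets'))       # 'tweet'
print(stemmer.crunch('running'))         # 'run'
```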

Secondly, we are going to implement our preprocessor. This module’s purpose is to preprocess the given tweets and extract only the useful information out of them. To be more specific, the preprocessor’s role is to (a walkthrough of these steps on a single sample tweet is sketched right after the list):

  1. Remove any non-ASCII characters (for example, emojis)
  2. Remove any leading and trailing whitespace characters
  3. Convert every character to its lower case counterpart
  4. Remove any URLs
  5. Remove any tags (user mentions)
  6. Remove any punctuation
  7. Tokenize the tweet at hand and lemmatize each one of its tokens, while removing any stopwords
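
The following is a minimal sketch of these cleaning steps applied to a single, made-up tweet; the simplified URL regex is an assumption for brevity, while everything else mirrors the ‘Preprocessor’ class defined right below (the ‘punkt’ and ‘stopwords’ corpora of ‘nltk’ are assumed to be available):

```python
import re
import string

from nltk import word_tokenize
from nltk.corpus import stopwords

line = 'Loving the new #album by @artist!! \U0001F3B5 https://t.co/xyz  '

line = line.encode('ascii', 'ignore').decode('ascii')             # drop non-ASCII characters (emojis)
line = line.strip().lower()                                       # trim whitespace and lower-case
line = re.sub(r'(?i)\bhttps?://\S+', '', line)                    # drop URLs (simplified regex)
line = re.sub(r'@[^\s]+', '', line)                               # drop tags
line = line.translate(str.maketrans('', '', string.punctuation))  # drop punctuation

ignore = set(stopwords.words('english'))
tokens = [Cruncher().crunch(token) for token in word_tokenize(line) if token not in ignore]

print(tokens)  # e.g. ['loving', 'new', 'album']
```

The full ‘Preprocessor’ class follows.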
```python
import os
import sys
import re
import string

from nltk import word_tokenize
from nltk.corpus import stopwords

class Preprocessor:

    valid_labels = { 'positive', 'negative', 'neutral', 'unknown' }

    urlregex = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    tagregex = r'''@[^\s]+'''

    def __init__(self, filenames, cruncher, save=True):
        if not isinstance(cruncher, Cruncher):
            raise ValueError("'cruncher' is not an instance of 'Cruncher'")

        if not os.path.isdir('out'):
            os.mkdir('out')

        self.path = platform_filename(filenames, ['preprocessed'])

        self.labels = {}
        self.tweets = {}

        # If a preprocessed version of the given files already exists, simply load it
        if os.path.isfile(self.path + '.tsv'):
            with open(self.path + '.tsv', mode='r', encoding='ascii') as file:
                tokenized_lines = [word_tokenize(line) for line in file.readlines()]

            counts = dict(zip(self.valid_labels, [0] * len(self.valid_labels)))

            for line in tokenized_lines:
                id, label, tokens = line[0], line[1], line[2:]

                if tokens:
                    self.tweets[id] = tokens
                    self.labels[id] = label

                    counts[label] += 1

            for label, count in counts.items():
                print('<LOG>: Loaded', str(count).rjust(5), ("'" + label + "'").ljust(max(map(len, self.valid_labels)) + 2), 'tweets from', self.path + '.tsv', file=sys.stderr)

            return

        for filename in filenames:
            with open(filename, mode='r', encoding='utf8') as file:
                print('<LOG>: Processing', ("'" + filename + "'").ljust(max(map(len, filenames)) + 2), file=sys.stderr)

                lines = file.readlines()

            ignore = set(stopwords.words('english'))

            counts = dict(zip(self.valid_labels, [0] * len(self.valid_labels)))

            for line in lines:
                # Remove any non ascii characters (for example emojis)
                line = line.encode('ascii', 'ignore').decode('ascii')

                # Remove any leading and trailing whitespace characters
                line = line.strip()

                # Convert every character to its lower case counterpart
                line = line.lower()

                # Remove any urls
                line = re.sub(self.urlregex, '', line)

                # Remove any tags
                line = re.sub(self.tagregex, '', line)

                # Remove any punctuation
                line = line.translate(str.maketrans('', '', string.punctuation))

                # Tokenize the tweet at hand and lemmatize each one of its tokens
                # while removing any stopwords
                tokens = word_tokenize(line)
                tokens = [cruncher.crunch(token) for token in tokens if token not in ignore]

                if tokens[2] in self.valid_labels:
                    id, label, tokens = tokens[0], tokens[2], tokens[3:]

                    if tokens:
                        self.tweets[id] = tokens
                        self.labels[id] = label

                        counts[label] += 1
                else:
                    raise ValueError("'" + tokens[2] + "' is not a valid label")

            for label, count in counts.items():
                print('<LOG>: Saving', str(count).rjust(5), ("'" + label + "'").ljust(max(map(len, self.valid_labels)) + 2), 'tweets to', self.path + '.tsv', file=sys.stderr)

        if save:
            with open(self.path + '.tsv', 'w', encoding='ascii') as file:
                file.write('\n'.join([id + '\t' + self.labels[id] + '\t' + ' '.join(tweet) for id, tweet in self.tweets.items()]))

    def by_label(self, labels):
        labels = set(labels)

        for label in labels:
            if label not in self.valid_labels:
                raise ValueError("'" + label + "' is not a valid label")

        return {
            label: [(id, self.tweets[id]) for id, l in self.labels.items() if l == label]
            for label in labels
        }
```
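
A brief, hypothetical usage sketch of the class above; ‘by_label’ groups the preprocessed tweets by the requested labels:

```python
# Hypothetical usage of the Preprocessor (assumes the sample datasets are in place)
preprocessor = Preprocessor(filenames, Cruncher())

for label, tweets in preprocessor.by_label(['positive', 'negative']).items():
    print(label, len(tweets))  # number of preprocessed tweets carrying each label
```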

We have different dictionaries at our disposal, which record the sentimental value of words, otherwise known as ‘valence’, so we need to utilize them in order to sentimentally categorize the tokens that make up the processed tweets. To be more specific, the ‘Dictionary’ class’ constructor is supplied with the path of a directory. It then recursively searches that directory for text files. Each of these files, every line of which is expected to be of the form ‘<word> <valence>’, is scanned, and the valence of each word is recorded into a dictionary after being normalized to a common range ([-1, 1] in the code below; see the short rescaling example that follows).

We had some difficulties with the dictionaries supplied, as most of them were not of the format mentioned above. Some had multiple words, even sentences, on the same line; others had multiple occurrences of the same word, and so on.
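
To make the rescaling concrete, here is a minimal standalone sketch of the same linear conversion; the ‘convert’ helper below mirrors the static method defined in the class that follows:

```python
def convert(value, range_src, range_dst):
    min_src, max_src = range_src
    min_dst, max_dst = range_dst

    return min_dst + (((value - min_src) * (max_dst - min_dst)) / (max_src - min_src))

# Rescale a raw valence of 3.5, originally lying in [1, 5], to the range [-1, 1]
print(convert(3.5, (1, 5), (-1, 1)))  # 0.25
```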

```python
import os
import sys
import pickle

import numpy as np

class Dictioanry:

    filename = os.path.join(os.path.curdir, 'out', 'dictionary.pkl')

    @staticmethod
    def convert(value, range_src, range_dst):
        min_src, max_src = range_src
        min_dst, max_dst = range_dst

        return min_dst + (((value - min_src) * (max_dst - min_dst)) / (max_src - min_src))

    def __init__(self, root, duplicate_weight=0.5, save=True):
        # If the valences have already been pickled, simply load them
        if os.path.isfile(self.filename):
            with open(self.filename, mode='rb') as file:
                print('<LOG>: Loading word valences from', self.filename, file=sys.stderr)

                self.relpaths, self.valences = pickle.load(file)

            for i in range(len(self.relpaths)):
                elements = [values[i] for values in self.valences.values()]

                print('<LOG>:', 'The normalized valences of', os.path.basename(self.relpaths[i]).ljust(max(map(lambda path: len(os.path.basename(path)), self.relpaths))), 'are in the range', '[' + '{0:+.4f}'.format(min(elements)), ',', '{0:+.4f}'.format(max(elements)) + ']', file=sys.stderr)

            return

        if duplicate_weight < 0.0 or duplicate_weight > 1.0:
            raise ValueError("'duplicate_weight' must be a value in the range [0.0, 1.0]")

        self.relpaths = []

        for directory, _, filenames in os.walk(platform_path(root)):
            for filename in filenames:
                self.relpaths.append(os.path.join(root, directory, filename))

        self.valences = {}

        for index, fullpath in enumerate(self.relpaths):
            valences = {}

            with open(fullpath, mode='r', encoding='ascii', errors='ignore') as file:
                for line in file.readlines():
                    line = line.strip().split()

                    words, valence = line[:-1], float(line[-1])

                    for word in words:
                        if word not in valences:
                            valences[word] = valence
                        else:
                            # Linearly interpolate between the old and the new valence of a duplicate word
                            valences[word] = duplicate_weight * valences[word] + (1.0 - duplicate_weight) * valence

            for word, valence in valences.items():
                if word not in self.valences:
                    self.valences[word] = [0.0] * len(self.relpaths)

                self.valences[word][index] = valence

            valence_min = np.min(list(self.valences.values()))
            valence_max = np.max(list(self.valences.values()))

            print('<LOG>:', 'The valences of', os.path.basename(fullpath).ljust(max(map(lambda path: len(os.path.basename(path)), self.relpaths))), 'are in the range', '[' + '{0:+.4f}'.format(valence_min), ',', '{0:+.4f}'.format(valence_max) + ']', file=sys.stderr)

        # Normalize every valence to the range [-1, 1]
        for word in self.valences.keys():
            for index, value in enumerate(list(self.valences[word])):
                self.valences[word][index] = self.convert(value, (valence_min, valence_max), (-1, 1))

        if save:
            if not os.path.isdir('out'):
                os.mkdir('out')

            with open(self.filename, mode='wb') as file:
                pickle.dump((self.relpaths, self.valences), file)

                print('<LOG>: Saved word valences to', self.filename, file=sys.stderr)

    def per_tweet(self, tweets, vector_range):
        # One fresh row per tweet, one mean valence per lexicon file
        valences = [[0.0] * len(self.relpaths) for _ in range(len(tweets))]

        for i, tweet in enumerate(tweets):
            for j in range(len(self.relpaths)):
                valences[i][j] = np.mean([self.convert(self.valences[token][j], (-1, 1), vector_range) for token in tweet if token in self.valences])

        return valences
```
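
A hypothetical sketch of ‘per_tweet’, which is what the ‘Vectorizer’ below relies on to augment its vectors; for each tweet it yields one mean valence per lexicon file, rescaled to the requested range (the lexica path is the one used later in the notebook):

```python
# Hypothetical usage of Dictioanry.per_tweet
dictionary = Dictioanry('..\\..\\lexica')

tweets = [['good', 'morning'], ['worst', 'day', 'ever']]
valences = dictionary.per_tweet(tweets, (0.0, 1.0))

print(len(valences), len(valences[0]))  # one row per tweet, one mean valence per lexicon file
```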

We now present the ‘Vectorizer’ class, whose purpose is to convert the processed tweets into vectors. We first need to supply a method of vectorization among ‘bag-of-words’, ‘tf-idf’ and ‘word-2-vec’.

Bag-of-Words / Tf-idf: the vectors produced by the underlying ‘sklearn’ vectorizer are returned as-is.
Word2Vec: each tweet is represented by the average of its tokens’ word vectors; any token missing from the model’s vocabulary is assigned a random vector. Additionally, if a dictionary is supplied, each tweet’s vector is augmented with its per-lexicon mean valences, rescaled to the range of the vectors’ values (a toy sketch of the averaging step follows).
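
As a rough illustration of the averaging step, here is a minimal standalone sketch on a toy corpus; it assumes gensim 3.x, where the dimensionality keyword is ‘size’, matching the arguments used in the class below:

```python
import numpy as np
from gensim.models import Word2Vec

toy_tweets = [['good', 'movie'], ['bad', 'service'], ['good', 'service']]

model = Word2Vec(sentences=toy_tweets, size=10, min_count=1, seed=1)

# A tweet is represented by the mean of its tokens' word vectors
tweet = ['good', 'service']
vector = np.mean([model.wv[token] for token in tweet if token in model.wv], axis=0)

print(vector.shape)  # (10,)
```

The ‘Vectorizer’ class itself follows.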

```python
import sys
import os
import re
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec

import numpy as np

class Vectorizer:

    vector_size = 300

    bowargs = {
        "max_features": vector_size,
        "stop_words": 'english',
        "max_df": 0.5,
        "min_df": 0.01
    }

    tfidfargs = {
        "max_df": 1.0,
        "min_df": 1,
        "max_features": vector_size,
        "stop_words": 'english'
    }

    w2vargs = {
        "size": vector_size,
        "window": 5,
        "min_count": 2,
        "sg": 1,
        "hs": 0,
        "negative": 10,
        "workers": 2,
        "seed": 34
    }

    supported_methods = { 'word2vec', 'bagofwords', 'tfidf' }

    def __init__(self, method='word2vec'):
        self.method = re.sub(r'''_|-|\ ''', '', method)

        if self.method == 'word2vec':
            self.underlying = Word2Vec(**self.w2vargs)
        elif self.method == 'bagofwords':
            self.underlying = CountVectorizer(**self.bowargs)
        elif self.method == 'tfidf':
            self.underlying = TfidfVectorizer(**self.tfidfargs)
        else:
            raise ValueError("'" + self.method + "' is not supported")

    def vectorize(self, preprocessor, dictionary, save=True):
        # If a list of filenames is supplied, attempt to load the corresponding pickled vectors
        if isinstance(preprocessor, list):
            path = platform_filename(preprocessor, ['preprocessed', self.method] + (['augmented'] if dictionary else [])) + '.pkl'

            if not os.path.isfile(path):
                raise ValueError("'" + path + "' is not a file")

            with open(path, 'rb') as file:
                labels, vectors = pickle.load(file)

                print('<LOG>: Loaded', len(vectors), 'vectors from', path, '[' + str(len(list(vectors.values())[0])), 'features each]', file=sys.stderr)

            return dict(zip(vectors.keys(), labels)), vectors

        if not isinstance(preprocessor, Preprocessor):
            raise ValueError("'preprocessor' is not an instance of 'Preprocessor'")

        path = '_'.join([preprocessor.path, self.method] + (['augmented'] if dictionary else [])) + '.pkl'

        return self.process(preprocessor, dictionary, path if save else None)

    def process(self, preprocessor, dictionary, path):
        tweets = list(preprocessor.tweets.values())

        if self.method == 'word2vec':
            self.underlying.build_vocab(tweets)
            self.underlying.train(sentences=tweets, total_examples=len(tweets), epochs=20)

            vectors = [None] * len(tweets)

            for i, tweet in enumerate(tweets):
                vector = [None] * len(tweet)

                for j, token in enumerate(tweet):
                    if token in self.underlying.wv:
                        vector[j] = self.underlying.wv[token]
                    else:
                        # Assign a random vector to any out-of-vocabulary token
                        vector[j] = 2.0 * np.random.randn(self.vector_size) - 1.0

                vectors[i] = np.mean(vector, axis=0)
        else:
            concatenated = [' '.join(tweet) for tweet in tweets]

            vectors = self.underlying.fit_transform(concatenated).toarray()

        if dictionary:
            flattened = list(np.asarray(vectors).flatten())

            vmin, vmax = min(flattened), max(flattened)

            # Append each tweet's per-lexicon mean valences, rescaled to the vectors' value range
            augmented = [None] * len(vectors)

            for i, valences in enumerate(dictionary.per_tweet(tweets, (vmin, vmax))):
                augmented[i] = np.concatenate((vectors[i], valences))

            vectors = augmented

            print('<LOG>: The', ('augmented ' if augmented else '') + 'vectors\' values are in the range', '[' + '{0:.4f}'.format(vmin), ',', '{0:.4f}'.format(vmax) + ']', file=sys.stderr)

        vectors = dict(zip(preprocessor.tweets.keys(), vectors))

        if path:
            with open(path, 'wb') as file:
                pickle.dump((list(preprocessor.labels.values()), vectors), file)

                print('<LOG>: Saved', len(vectors), 'vectors to', path, '[' + str(len(list(vectors.values())[0])), 'features each]', file=sys.stderr)

        return preprocessor.labels, vectors
```

The ‘Visualizer’ class is responsible for the visualization of our data. The visualization methods currently supported are ‘bar_plot’, ‘word_cloud’, ‘tsne’ and ‘heat_map’.

```python
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd

from sklearn.manifold import TSNE

from wordcloud import WordCloud, STOPWORDS

import numpy as np

from gensim.models import Word2Vec

np.random.seed(19680801)

class Visualizer:

    supported_methods = { 'word_cloud', 'bar_plot', 'tsne', 'heat_map' }

    def __init__(self, preprocessor):
        if not isinstance(preprocessor, Preprocessor):
            raise ValueError("'preprocessor' is not an instance of 'Preprocessor'")

        self.preprocessor = preprocessor

    def visualize(self, labels=Preprocessor.valid_labels, method='word_cloud', dictionary=None, model=None, max_words=300):
        tokens = []

        for _, tweets in self.preprocessor.by_label(labels).items():
            for _, tweet in tweets:
                tokens += [token for token in tweet]

        if method == 'word_cloud':
            self.word_cloud(tokens)
        elif method == 'bar_plot':
            self.bar_plot(tokens)
        elif method == 'tsne':
            self.tsne(model, max_words)
        elif method == 'heat_map':
            self.heat_map(tokens, dictionary)
        else:
            raise ValueError("'" + method + "' is not supported")

    @staticmethod
    def bar_plot(tokens):
        count = Counter(tokens)

        dataFrame = pd.DataFrame(data=count.most_common(50), columns=['Word', 'Count'])
        dataFrame.plot.bar(x='Word', y='Count', figsize=(20, 10))

    @staticmethod
    def word_cloud(tokens):
        wordcloud = WordCloud(width=1200, height=1200,
                              background_color='white',
                              stopwords=set(STOPWORDS),
                              min_font_size=14).generate(' '.join(tokens))

        plt.figure(figsize=(8, 8), facecolor=None)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)

        plt.show()

    @staticmethod
    def tsne(model, max_words):
        if not isinstance(model, Word2Vec):
            raise ValueError("'model' is not an instance of 'Word2Vec'")

        if not isinstance(max_words, int) or max_words <= 0:
            raise ValueError("'max_words' must have an integer value greater than 0")

        labels = []
        tokens = []

        counter = 0
        for word in model.wv.vocab:
            tokens.append(model.wv[word])
            labels.append(word)

            counter += 1
            if counter == max_words:
                break

        tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=5000, random_state=23)
        new_values = tsne_model.fit_transform(tokens)

        x = []
        y = []
        for value in new_values:
            x.append(value[0])
            y.append(value[1])

        plt.figure(figsize=(16, 16))

        for i in range(len(x)):
            plt.scatter(x[i], y[i])
            plt.annotate(labels[i],
                         xy=(x[i], y[i]),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')

        plt.show()

    @staticmethod
    def heat_map(tokens, dictioanry):
        if not isinstance(dictioanry, Dictioanry):
            raise ValueError("'dictioanry' is not an instance of 'Dictioanry'")

        rgb = lambda valence: (255 + Dictioanry.convert(valence, (-1, 1), (0, 255)) + 255) / 3

        x, y, c = [], [], []
        for token in tokens:
            if token in dictioanry.valences:
                x.append(np.random.rand())
                y.append(np.random.rand())
                c.append(rgb(np.mean(dictioanry.valences[token])))

        plt.scatter(x, y, c=c, alpha=0.8)

        plt.show()
```

With everything in place, we can now preprocess the tweets, build the dictionary, vectorize and instantiate a visualizer:

```python
preprocessor = Preprocessor(filenames, Cruncher())
dictionary = Dictioanry('..\\..\\lexica')

vectorizer = Vectorizer()

labels, vectors = vectorizer.vectorize(preprocessor, dictionary)

visualizer = Visualizer(preprocessor)
```
```
<LOG>: Processing '..\data\train2017.tsv'
<LOG>: Saving 11517 'positive' tweets to .\out\test2017_train2017_preprocessed.tsv
<LOG>: Saving 0 'unknown' tweets to .\out\test2017_train2017_preprocessed.tsv
<LOG>: Saving 4476 'negative' tweets to .\out\test2017_train2017_preprocessed.tsv
<LOG>: Saving 12067 'neutral' tweets to .\out\test2017_train2017_preprocessed.tsv
<LOG>: Processing '..\data\test2017.tsv'
<LOG>: Saving 0 'positive' tweets to .\out\test2017_train2017_preprocessed.tsv
<LOG>: Saving 12282 'unknown' tweets to .\out\test2017_train2017_preprocessed.tsv
<LOG>: Saving 0 'negative' tweets to .\out\test2017_train2017_preprocessed.tsv
<LOG>: Saving 0 'neutral' tweets to .\out\test2017_train2017_preprocessed.tsv
<LOG>: Saved word valences to .\out\dictionary.pkl
<LOG>: The augmented vectors' values are in the range [-7.6609 , 5.6866]
<LOG>: Saved 40341 vectors to .\out\test2017_train2017_preprocessed_word2vec_augmented.pkl [300 features each]
```
```python
visualizer.visualize(method='word_cloud', dictionary=dictionary, model=vectorizer.underlying)
```

[png: word cloud of the preprocessed tweets]

```python
visualizer.visualize(method='tsne', dictionary=dictionary, model=vectorizer.underlying)
```

[png: t-SNE plot of the word2vec embeddings]

```python
visualizer.visualize(method='bar_plot', dictionary=dictionary, model=vectorizer.underlying)
```

[png: bar plot of the 50 most common tokens]

Two classifiers already provided by the Python module ‘sklearn’, namely an SVM and a k-nearest-neighbours classifier, have been merged into one ‘Classifier’ class, which offers an abstraction layer over the two (a toy usage example follows the class definition).

```python
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

class Classifier:
    def __init__(self, vectors, labels, method='svm'):
        if method == 'svm':
            self.underlying = svm.SVC(kernel='sigmoid', gamma='scale', C=1, probability=True)
        elif method == 'knn':
            self.underlying = KNeighborsClassifier(n_neighbors=100)
        else:
            raise ValueError("'" + method + "' is not supported")

        self.underlying.fit(vectors, labels)

    def predict(self, unknown):
        return self.underlying.predict(unknown)

    def predict_proba(self, unknown):
        return self.underlying.predict_proba(unknown)
```

The ‘Round-Robin’ classifier has been implemented separately. It works as follows (the pairwise label generation is sketched right after this list):

  1. It creates all pairwise combinations of the ‘positive’, ‘negative’ and ‘neutral’ labels.
  2. For each combination, it creates a KNN classifier and trains it with the vectors whose labels belong to that combination.
  3. It predicts the probabilities for the whole train set and test set and concatenates the prediction results of every combination.
  4. Finally, it creates one more KNN classifier, trains it on the concatenated predictions of the previous classifiers and predicts the labels of the concatenated test vectors.
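
For reference, this is how the pairwise label combinations are generated; the snippet mirrors the ‘itertools.combinations’ call used in the class below:

```python
from itertools import combinations

for pair in combinations(['positive', 'negative', 'neutral'], 2):
    print(pair)

# ('positive', 'negative')
# ('positive', 'neutral')
# ('negative', 'neutral')
```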

```python
import numpy as np

from itertools import combinations

from sklearn.neighbors import KNeighborsClassifier

class RoundRobin:
    def __init__(self, labels, labeledVector, unknownVector):
        self.comb = combinations(['positive', 'negative', 'neutral'], 2)

        self.labels = labels
        self.totalTrainSet = labeledVector
        self.totalTestSet = unknownVector

    def classify(self):
        finalTestSet = []
        finalTrainSet = []

        # Solve every pairwise sub-problem and concatenate the predicted probabilities
        for combination in self.comb:
            prediction = self.RR_knn(combination, self.labels, self.totalTrainSet, self.totalTestSet, subProblem=True)

            if len(finalTrainSet) == 0:
                finalTrainSet = prediction[0]
                finalTestSet = prediction[1]
            else:
                finalTrainSet = self.appendPrediction(finalTrainSet, prediction[0])
                finalTestSet = self.appendPrediction(finalTestSet, prediction[1])

        # Train one final classifier on the concatenated predictions
        finalPrediction = self.RR_knn(['positive', 'negative', 'neutral'], self.labels, finalTrainSet, finalTestSet, subProblem=False)

        return finalPrediction

    def RR_knn(self, combination, labels, totalTrainSet, totalTestSet, subProblem=False):
        iris_X = []
        iris_Y = []

        # Keep only the training vectors whose label belongs to the given combination
        for index, label in enumerate(labels):
            if label in combination:
                iris_X.append(totalTrainSet[index])
                iris_Y.append(label)

        knn = KNeighborsClassifier(n_neighbors=100)
        knn.fit(iris_X, iris_Y)

        if subProblem:
            prediction = [knn.predict_proba(totalTrainSet), knn.predict_proba(totalTestSet)]
        else:
            prediction = knn.predict(totalTestSet)

        return prediction

    def appendPrediction(self, current, prediction):
        newSet = []

        for i in range(len(current)):
            newSet.append(np.concatenate([current[i], prediction[i]]))

        return newSet
```

Let us now define an ‘Evaluator’ class, an abstraction layer over the ‘f1-score’ and the ‘accuracy-score’ metrics provided by ‘sklearn’ (a short standalone example of the two metrics follows the class definition).

```python
import re

from sklearn.metrics import f1_score, accuracy_score

import numpy as np

class Evaluator:
    def __init__(self, filename='../data/SemEval2017_task4_subtaskA_test_english_gold.txt'):
        with open(filename, mode='r', encoding='ascii', errors='ignore') as file:
            self.results = {}

            for line in file.readlines():
                id, result = line.split()

                self.results[id] = result

    def evaluate(self, unknown, method='f1score'):
        method = re.sub(r'''_|-|\ ''', '', method)

        if not isinstance(unknown, dict):
            raise ValueError("'unknown' is not an instance of 'dict'")

        facts = [self.results[id] for id in unknown.keys()]
        preds = list(unknown.values())

        if method == 'f1score':
            return f1_score(facts, preds, average='weighted', labels=np.unique(preds))
        elif method == 'accuracyscore':
            return accuracy_score(facts, preds, normalize=True)
        else:
            raise ValueError("'" + method + "' is not supported")
```
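
A short standalone example of the two underlying ‘sklearn’ metrics on made-up predictions, mirroring the arguments the ‘Evaluator’ passes to them:

```python
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

facts = ['positive', 'negative', 'neutral', 'positive']
preds = ['positive', 'neutral',  'neutral', 'positive']

print(accuracy_score(facts, preds, normalize=True))                         # 0.75
print(f1_score(facts, preds, average='weighted', labels=np.unique(preds)))  # ~0.89
```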

The following is the master script, which serves as a manager of everything mentioned thus far.

If the vectors already exist on disk, it loads them; otherwise it calculates them from scratch.
It then separates the train set vectors from the test set vectors and calls the classifiers to train the models and predict the results.

The last loop repeats this procedure for each vectorization method.

```python
import sys
import time

def visualization(train_filename, cruncher_type='lemmatizer'):
    preprocessor = Preprocessor([train_filename], Cruncher(cruncher_type))

    return Visualizer(preprocessor)

def evaluation(filenames, dictionary_root='../lexica', cruncher_type='lemmatizer', vectorizer_type='word2vec', metrics=['f1-score', 'accuracy-score']):
    if not isinstance(filenames, list):
        raise ValueError("'" + str(filenames) + "' is not an instance of 'list'")

    beg = time.time()

    vectorizer = Vectorizer(vectorizer_type)

    try:
        # Attempt to load any previously saved vectors
        labels, vectors = vectorizer.vectorize(filenames, dictionary_root)
    except:
        # Otherwise, compute the vectors from scratch
        preprocessor = Preprocessor(filenames, Cruncher(cruncher_type))
        dictionary = Dictioanry(dictionary_root) if dictionary_root else None

        labels, vectors = vectorizer.vectorize(preprocessor, dictionary)

    test_ids, test_labels, test_vectors = [], [], []
    train_ids, train_labels, train_vectors = [], [], []

    for id, label in labels.items():
        if label == 'unknown':
            test_ids.append(id)
            test_labels.append(label)
            test_vectors.append(vectors[id])
        else:
            train_ids.append(id)
            train_labels.append(label)
            train_vectors.append(vectors[id])

    evaluator = Evaluator()

    for classifing in ['knn', 'rrb', 'svm']:
        if classifing != 'rrb':
            classifier = Classifier(train_vectors, train_labels, classifing)
            predictions = classifier.predict(test_vectors)
        else:
            classifier = RoundRobin(train_labels, train_vectors, test_vectors)
            predictions = classifier.classify()

        for metric in metrics:
            value = evaluator.evaluate(dict(zip(test_ids, predictions)), metric)

            print('<LOG>: The performance of', "'" + classifing + "'", 'according to the', ("'" + metric + "'").ljust(max(map(len, metrics)) + 2), "metric is", '{0:.6f}'.format(value))

    end = time.time()

    print('\n\nElapsed time:', '{0:.6f}'.format(end - beg), 'seconds', file=sys.stderr)

for vectorizer_type in Vectorizer.supported_methods:
    evaluation(filenames, vectorizer_type=vectorizer_type)
```
```
<LOG>: Loaded 40341 vectors from .\out\test2017_train2017_preprocessed_bagofwords_augmented.pkl [80 features each]
<LOG>: The performance of 'knn' according to the 'f1-score' metric is 0.377232
<LOG>: The performance of 'knn' according to the 'accuracy-score' metric is 0.497150
```