# This file contains a bunch of code that computes the inter-annotator
# agreement using the annotations made during lesson. Various methods
# are used, some only work for 2 annotators, and some for multiple
# annotators. In any case, related documentation to the wikipedia page
# describing the methods is reported.
#
# Here follows some data found by executing the script. Note that the
# various IDs used correspond to the sheetID.
#
# Made by Leonardo Tamiano on 24/10/19.

# ------------------ Imports ---------------------------------------

import gspread
from oauth2client.service_account import ServiceAccountCredentials
from pprint import pprint
from itertools import combinations
import numpy as np

# OAuth scopes required to read the annotation spreadsheet.
scope = ["https://spreadsheets.google.com/feeds",
         'https://www.googleapis.com/auth/spreadsheets',
         "https://www.googleapis.com/auth/drive.file",
         "https://www.googleapis.com/auth/drive"]

# NOTE(review): this authenticates against Google at import time and
# needs a local "creds.json"; importing the module without credentials
# or network access will fail.
creds = ServiceAccountCredentials.from_json_keyfile_name("creds.json", scope)
client = gspread.authorize(creds)
sh = client.open("Annotazione")

# Universal Dependencies POS tags used as the annotation classes.
ANNOTATION_CLASSES = ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ",
                      "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT",
                      "SCONJ", "SYM", "VERB", "X"]

# Data taken from google sheet. ROWS[a][i] is the class that annotator
# a assigned to the i-th token; an empty string means "not annotated".
ROWS = [
    [u'ADV', u'VERB', u'ADV', u'ADJ', u'SCONJ', u'DET', u'NOUN', u'DET', u'NOUN', u'ADP',
     u'NOUN', u'ADJ', u'ADP', u'PROPN', u'PROPN', u'PROPN', u'VERB', u'ADJ', u'SYM', u'CCONJ',
     u'SCONJ', u'VERB', u'DET', u'ADJ', u'NOUN', u'SCONJ', u'VERB', u'AUX', u'ADJ', u'ADP',
     u'NOUN', u'SYM', u'PUNCT', u'ADV', u'SCONJ', u'VERB', u'ADJ', u'VERB', u'SCONJ', u'ADV',
     u'ADJ', u'NOUN', u'ADP', u'ADJ', u'NOUN', u'CCONJ', u'DET', u'NOUN', u'ADV', u'VERB',
     u'ADJ', u'DET', u'NOUN', u'CCONJ', u'DET', u'NOUN', u'PROPN', u'PART', u'PROPN', u'PUNCT'],
    [u'part', u'aux', u'adv', u'adj', u'sconj', u'det', u'noun', u'adp', u'noun', u'adp',
     u'noun', u'adj', u'adp', u'propn', u'propn', u'propn', u'aux', u'adj', u'sym', u'cconj',
     u'cconj', u'verb', u'det', u'adj', u'noun', u'sconj', u'verb', u'verb', u'verb', u'adp',
     u'propn', u'sym', u'sym', u'sconj', u'sconj', u'verb', u'adj', u'verb', u'adp', u'adp',
     u'adj', u'noun', u'', u'adj', u'noun', u'cconj', u'adp', u'noun', u'adv', u'aux',
     u'adj', u'adp', u'noun', u'cconj', u'adp', u'propn', u'propn', u'propn', u'propn', u'sym'],
    [u'CCONJ', u'VERB', u'ADV', u'ADJ', u'SCONJ', u'DET', u'NOUN', u'ADP', u'NOUN', u'ADP',
     u'NOUN', u'ADJ', u'ADP', u'PROPN', u'PROPN', u'PROPN', u'AUX', u'VERB', u'SYM', u'SCONJ',
     u'CCONJ', u'VERB', u'DET', u'ADJ', u'NOUN', u'SCONJ', u'AUX', u'VERB', u'ADJ', u'ADP',
     u'PROPN', u'SYM', u'PUNCT', u'ADJ', u'ADV', u'AUX', u'VERB', u'VERB', u'PRON', u'ADV',
     u'ADJ', u'NOUN', u'ADP', u'ADJ', u'NOUN', u'ADP', u'DET', u'NOUN', u'ADV', u'AUX',
     u'VERB', u'DET', u'NOUN', u'ADP', u'DET', u'NOUN', u'PROPN', u'PROPN', u'PROPN', u'PUNCT'],
    [u'ADV', u'VERB', u'ADV', u'ADJ', u'CCONJ', u'DET', u'NOUN', u'ADP', u'NOUN', u'CCONJ',
     u'NOUN', u'ADJ', u'ADP', u'PROPN', u'PROPN', u'PROPN', u'VERB', u'ADJ', u'PUNCT', u'ADV',
     u'CCONJ', u'VERB', u'DET', u'ADJ', u'NOUN', u'CCONJ', u'VERB', u'VERB', u'VERB', u'ADP',
     u'PROPN', u'PUNCT', u'PUNCT', u'PRON', u'CCONJ', u'VERB', u'ADJ', u'VERB', u'CCONJ', u'ADJ',
     u'ADJ', u'NOUN', u'ADP', u'ADJ', u'NOUN', u'ADP', u'DET', u'NOUN', u'ADV', u'VERB',
     u'ADJ', u'DET', u'NOUN', u'ADP', u'DET', u'PROPN', u'PROPN', u'PROPN', u'PROPN', u'PUNCT'],
    [u'ADV', u'VERB', u'ADV', u'ADJ', u'SCONJ', u'DET', u'NOUN', u'ADP', u'NOUN', u'ADP',
     u'NOUN', u'ADJ', u'ADP', u'PROPN', u'PROPN', u'PROPN', u'VERB', u'ADJ', u'SYM', u'SCONJ',
     u'SCONJ', u'VERB', u'DET', u'ADJ', u'NOUN', u'SCONJ', u'AUX', u'AUX', u'VERB', u'ADP',
     u'NOUN', u'SYM', u'PUNCT', u'PRON', u'SCONJ', u'VERB', u'ADJ', u'VERB', u'SCONJ', u'ADV',
     u'PRON', u'NOUN', u'ADP', u'ADJ', u'NOUN', u'ADP', u'DET', u'NOUN', u'NOUN', u'VERB',
     u'ADJ', u'DET', u'NOUN', u'ADP', u'DET', u'NOUN', u'PROPN', u'PROPN', u'PROPN'],
    [u'AUX', u'VERB', u'ADJ', u'NOUN', u'SCONJ', u'DET', u'NOUN', u'ADP', u'NOUN', u'ADP',
     u'NOUN', u'ADJ', u'ADP', u'PROPN', u'PROPN', u'PROPN', u'VERB', u'NOUN', u'PUNCT', u'SCONJ',
     u'SCONJ', u'VERB', u'DET', u'ADJ', u'NOUN', u'ADV', u'AUX', u'VERB', u'VERB', u'ADP',
     u'PROPN', u'PUNCT', u'PUNCT', u'DET', u'SCONJ', u'VERB', u'ADV', u'VERB', u'SCONJ', u'ADV',
     u'PRON', u'NOUN', u'ADP', u'ADV', u'NOUN', u'ADP', u'DET', u'NOUN', u'ADV', u'VERB',
     u'NOUN', u'DET', u'NOUN', u'ADP', u'DET', u'NOUN', u'PROPN', u'PROPN', u'PROPN', u'PUNCT'],
    [u'ADV', u'AUX', u'ADJ', u'NOUN', u'SCONJ', u'DET', u'NOUN', u'ADP', u'NOUN', u'ADP',
     u'NOUN', u'ADJ', u'ADP', u'PROPN', u'PROPN', u'PROPN', u'AUX', u'NOUN', u'PUNCT', u'CCONJ',
     u'SCONJ', u'VERB', u'DET', u'ADJ', u'NOUN', u'SCONJ', u'AUX', u'VERB', u'ADV', u'ADP',
     u'PROPN', u'PUNCT', u'PUNCT', u'PRON', u'SCONJ', u'AUX', u'ADV', u'AUX', u'SCONJ', u'ADV',
     u'ADV', u'NOUN', u'ADP', u'ADJ', u'NOUN', u'ADP', u'DET', u'NOUN', u'ADV', u'AUX',
     u'NOUN', u'DET', u'NOUN', u'ADP', u'DET', u'PROPN', u'PROPN', u'PART', u'PROPN', u'PUNCT']
]

# ------------------ Agreement for 2 annotators ---------------------------------------


def basic_agreement_from_spreadsheet(i, j, agreement_function):
    """Compute the pairwise agreement between annotators i and j.

    i, j are 1-based sheet numbers; the annotations are read live from
    rows 2 and 4 of the corresponding worksheets of the "Annotazione"
    google spreadsheet. agreement_function is one of cohen_kappa_hp or
    scott_pi_hp.
    """
    wsheet1 = sh.get_worksheet(i - 1)
    row11 = wsheet1.row_values(2)
    row12 = wsheet1.row_values(4)

    wsheet2 = sh.get_worksheet(j - 1)
    row21 = wsheet2.row_values(2)
    row22 = wsheet2.row_values(4)

    return basic_agreement(row11 + row12, row21 + row22,
                           ANNOTATION_CLASSES, agreement_function)


def basic_agreement(row1, row2, classes, agreement_function):
    """Chance-corrected agreement between two annotators.

    row1, row2 are the two annotators' label sequences (case-insensitive,
    empty string = not annotated); classes is the list of admissible
    labels (uppercase). agreement_function computes the hypothetical
    probability pe of chance agreement; the returned value is the
    generic kappa-style coefficient (po - pe) / (1 - pe).

    Raises ValueError if no token was annotated by both annotators.
    """
    # annotator statistics: for each annotator and for each class we
    # count the number of times that annotator has chosen that class.
    antr_1_stats = {c: 0 for c in classes}
    antr_2_stats = {c: 0 for c in classes}

    accuracy = 0  # tokens on which the two annotators agree
    terms = 0     # tokens annotated by both

    # zip stops at the shorter row, matching the original
    # min(len(row1), len(row2)) bound.
    for annotation_1, annotation_2 in zip(row1, row2):
        annotation_1 = annotation_1.upper()
        annotation_2 = annotation_2.upper()

        # check if they both have annotated that word. If not, do not
        # consider it.
        if annotation_1 != "" and annotation_2 != "":
            if annotation_1 == annotation_2:
                accuracy += 1
            antr_1_stats[annotation_1] += 1
            antr_2_stats[annotation_2] += 1
            terms += 1

    # guard against the (previously crashing) empty-overlap case
    if terms == 0:
        raise ValueError("no token was annotated by both annotators")

    # compute relative observed agreement
    accuracy = accuracy / float(terms)

    # hypothetical probability of chance agreement
    pe = agreement_function(antr_1_stats, antr_2_stats, classes, terms)

    return (accuracy - pe) / (1 - pe)


def cohen_kappa_hp(antr_1_stats, antr_2_stats, classes, terms):
    """Hypothetical probability of chance agreement described in
    Cohen's kappa method (https://en.wikipedia.org/wiki/Cohen%27s_kappa):
    pe = sum_c (n1_c / N) * (n2_c / N).
    """
    pe = 0
    for c in classes:
        pe += antr_1_stats[c] * antr_2_stats[c]
    return pe / float(terms ** 2)


def scott_pi_hp(antr_1_stats, antr_2_stats, classes, terms):
    """Hypothetical probability of chance agreement described in
    Scott's pi method (https://en.wikipedia.org/wiki/Scott%27s_Pi):
    pe = sum_c ((n1_c + n2_c) / (2N))^2.

    BUGFIX: the joint proportion is (n1_c + n2_c) / (2 * N); the old
    code divided by N only, inflating pe by a factor of 4 per class.
    """
    pe = 0
    for c in classes:
        pe += ((antr_1_stats[c] + antr_2_stats[c]) / (2.0 * terms)) ** 2
    return pe


# ------------------ Agreement for multiple annotators ---------------------------------------


def agreement_from_spreadsheet():
    """Compute Fleiss' kappa over the 7 annotators, reading rows 2 and 4
    of each worksheet of the "Annotazione" google spreadsheet."""
    rows = []
    for i in range(0, 7):
        wsheet = sh.get_worksheet(i)
        rows.append(wsheet.row_values(2) + wsheet.row_values(4))
    return fleiss_kappa(rows, ANNOTATION_CLASSES)


def fleiss_kappa(rows, classes):
    """Implementation of ideas found in
    https://www.wikiwand.com/en/Fleiss%27_kappa

    rows[a] is the label sequence of annotator a; classes is the list of
    admissible labels (uppercase). Returns the Fleiss' kappa agreement.

    NOTE(review): the formula assumes every subject received exactly
    n = len(rows) ratings; tokens skipped by some annotator (empty
    string or short row) make the result approximate.
    """
    # how many annotators do we have?
    n = len(rows)
    # how many subjects do we have at most?
    N_max = len(rows[0])
    # how many classes do we have?
    k = len(classes)

    # Map each class label to a column index. Built from the `classes`
    # parameter (the old code hardcoded a dict that duplicated
    # ANNOTATION_CLASSES and silently ignored `classes`).
    classes_to_num = {c: idx for idx, c in enumerate(classes)}

    # m[i][j] := number of raters who assigned the i-th subject
    # to the j-th category
    m = np.zeros((N_max, k))
    for i in range(0, N_max):
        for a in range(0, n):
            if i < len(rows[a]) and rows[a][i] != '':
                m[i][classes_to_num[rows[a][i].upper()]] += 1

    # P[i] = extent to which raters agree for the i-th subject
    P = [0] * N_max
    for i in range(0, N_max):
        for j in range(0, k):
            P[i] += m[i][j] ** 2
        P[i] = (P[i] - n) / float(n * (n - 1))

    # P_mean = mean of the P[i]'s
    P_mean = sum(P) / float(N_max)

    # p[j] = proportion of all assignments which were to the j-th category
    p = [0] * k
    for j in range(0, k):
        for i in range(0, N_max):
            p[j] += m[i][j]
        p[j] = p[j] / float(N_max * n)

    # PE as defined in the wikipedia page
    PE = 0
    for j in range(0, k):
        PE += p[j] ** 2

    # final agreement
    return (P_mean - PE) / float(1 - PE)


# ------------------ Testing area ---------------------------------------

if __name__ == "__main__":
    # print pairwise coefficients for the locally saved data
    my_list = [1, 2, 3, 4, 5, 6, 7]
    function = cohen_kappa_hp

    # BUGFIX: the printed label is derived from the selected method
    # (the old code always printed "Scott's pi" even though
    # cohen_kappa_hp was being used).
    coefficient_name = ("Cohen's kappa" if function is cohen_kappa_hp
                        else "Scott's pi")

    for pair in combinations(my_list, 2):
        i, j = pair
        value = basic_agreement(ROWS[i - 1], ROWS[j - 1],
                                ANNOTATION_CLASSES, function)
        print(coefficient_name + " coefficient for " + str(i) + ", " +
              str(j) + " is: " + str(value))

    print(fleiss_kappa(ROWS, ANNOTATION_CLASSES))