#!/usr/bin/env python3
# Code taken from lecture 12 of IR course done on 06/12/19

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import os
import numpy as np
import pandas as pd
# import pickle

# If you don't want to index the entire dataset, you can add only a
# specific subfolder using some of the following values: alt.atheism,
# comp.graphics, comp.windows.x, misc.forsale.
# DATASET = "20_newsgroups/comp.graphics"
DATASET = "20_newsgroups"

paths = []
for (dirpath, dirnames, filenames) in os.walk(os.path.join(os.getcwd(), DATASET)):
    for filename in filenames:
        paths.append(os.path.join(dirpath, filename))


# -----------------------------------
# Pre-processing chain
# -----------------------------------

def remove_header(data):
    # Drop everything up to the first blank line (the newsgroup header).
    try:
        ind = data.index('\n\n')
        data = data[ind:]
    except ValueError:
        print("No Header")
    return data


def convert_lower_case(data):
    return np.char.lower(data)


def remove_stop_words(data):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words:
            new_text = new_text + " " + w
    return np.char.strip(new_text)


def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
    data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data


def remove_apostrophe(data):
    return np.char.replace(data, "'", "")


def remove_single_characters(data):
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + " " + w
    return np.char.strip(new_text)


def convert_numbers(data):
    # Spell out digits so they survive punctuation removal and tokenization.
    digit_names = ["zero", "one", "two", "three", "four",
                   "five", "six", "seven", "eight", "nine"]
    for digit, name in enumerate(digit_names):
        data = np.char.replace(data, str(digit), " " + name + " ")
    return data


def stemming(data):
    stemmer = PorterStemmer()
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return np.char.strip(new_text)


def preprocess(data, query):
    if not query:
        data = remove_header(data)
    data = convert_lower_case(data)
    data = convert_numbers(data)
    data = remove_punctuation(data)  # the comma is removed separately
    data = remove_stop_words(data)
    data = remove_apostrophe(data)
    data = remove_single_characters(data)
    data = stemming(data)
    return data


# -----------------------------------
# Inverted Index Construction
# -----------------------------------

doc = 0
postings = pd.DataFrame()

for path in paths[0:50]:
    with open(path, 'r', encoding='cp1250') as file:
        text = file.read().strip()

    preprocessed_text = preprocess(text, False)
    tokens = word_tokenize(str(preprocessed_text))

    for token in tokens:
        if token in postings:
            # The postings list is a set, so adding the current doc ID
            # in place is enough; duplicates are ignored automatically.
            postings[token][0].add(doc)
        else:
            postings.insert(value=[{doc}], loc=0, column=token)
    doc += 1

# NOTE: use pickle to save the index on the local disk
#
# to write: postings.to_pickle(DATASET + "_unigram_postings")
# to read:  postings = pd.read_pickle(DATASET + "_unigram_postings")


def get_posting(word):
    if word in postings:
        return postings[word][0]
    else:
        # Return an empty set (not a list) so set operations keep working.
        return set()


def print_word_postings(word):
    preprocessed_word = str(preprocess(word, True))
    print(preprocessed_word)
    postings_list = get_posting(preprocessed_word)
    print("Document Frequency:", len(postings_list))
    print("Postings List:", postings_list)
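

# ------------------------------------------------------------------
# Quick sanity check (illustrative addition, not from the lecture):
# dump the postings of a single term before running boolean queries.
# "place" is just an example word; it matches the demo queries below.
# ------------------------------------------------------------------
print_word_postings("place")
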
Frequency:", len(postings)) print("Postings List:", postings) # ----------------------------------- # Query language implementation # ----------------------------------- def get_not(word): a = get_posting(word) b = set(range(len(paths))) return b.difference(a) def generate_command_tokens(query): query = query.lower() tokens = word_tokenize(query) command_tokens = [] query_tokens = [] for t in tokens: if t not in ['and', 'or', 'not']: processed_token = preprocess([t], True) query_tokens.append(str(processed_token)) else: command_tokens.append(t) return command_tokens, query_tokens def generate_not_tuple(query_tokens, command_tokens): tup = set() # as long as there are NOT command to execute while 'not' in command_tokens: i = command_tokens.index('not') word = query_tokens[i] word_postings = get_not(word) # get result of NOT tup.update(word_postings) command_tokens.pop(i) # remove command # Replace the word with an ID to signal that it has been # processed # # NOTE: numbers have been already removed in the preprocessing # stage. query_tokens[i] = i print("\nAfter Not Processing:", command_tokens, query_tokens) return tup def binary_operations(query_tokens, command_tokens, tup): if not query_tokens[0]: return tup a = get_posting(query_tokens[0]) query_tokens.pop(0) for i in range(len(command_tokens)): if type(query_tokens[i]) == int: b = tup else: b = get_posting(query_tokens[i]) if command_tokens[i] == 'and': a = a.intersection(b) elif command_tokens[i] == 'or': a = a.union(b) else: print("Invalid Command") return a def execute_query(query): command_tokens, query_tokens = generate_command_tokens(query) tup = generate_not_tuple(query_tokens, command_tokens) print("-----------------------------------------") print("\nCommand Tokens:", command_tokens) print("\nQuery Tokens:", query_tokens) print("\nNot Tup:", len(tup)) final_set = binary_operations(query_tokens, command_tokens, tup) final_set = sorted(final_set) print("\nFinal Set:", final_set) print("-----------------------------------------") return final_set execute_query("place") execute_query("welcome") execute_query("authority") execute_query("place and not authority") execute_query("place or authority") execute_query("place or authority and not welcome")