#!/usr/bin/python3
"""
The point of this script is to build a vector space model for the
reuters21578 dataset using basic tf-idf weighting.

-------------------------------
The Reuters-21578, Distribution 1.0 test collection is available from
David D. Lewis' professional home page, currently:
    http://www.research.att.com/~lewis

Besides this README file, the collection consists of 22 data files, an
SGML DTD file describing the data file format, and six files describing
the categories used to index the data. (See Sections VI and VII for more
details.) Some additional files, which are not part of the collection
but have been contributed by other researchers as useful resources, are
also included. All files are available uncompressed, and in addition a
single gzipped Unix tar archive of the entire distribution is available
as reuters21578.tar.gz.

The text categorization mailing list, DDLBETA, is a good place to send
questions about this collection and other text categorization issues.
You may join the list by writing David Lewis at lewis@research.att.com.
-------------------------------

reuters21578 Dataset categories:

                    Number of     Number of Categories    Number of Categories
    Category Set    Categories    w/ 1+ Occurrences       w/ 20+ Occurrences
    ************    **********    ********************    ********************
    EXCHANGES           39               32                        7
    ORGS                56               32                        9
    PEOPLE             267              114                       15
    PLACES             175              147                       60
    TOPICS             135              120                       57
"""

"""
TODOs:
    - Do feature selection on words of dictionary
"""

from bs4 import BeautifulSoup
import traceback
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
# -- mine
import myutil

# ------------------------------
# Load dataset in python objects
# -----
DATASETS_PATH = "../../datasets/"
ORIG_REUTERS_DATASET_PATH = DATASETS_PATH + "/reuters21578_orig/"
REUTERS_DATASET_PATH = DATASETS_PATH + "/reuters21578/reuters21578_dataset.pickle"
REUTERS_DICTIONARY_PATH = DATASETS_PATH + "/reuters21578/reuters21578_dictionary.pickle"


class Document(object):
    def __init__(self, topics, title, date, body):
        self.topics = topics
        self.title = title
        self.date = date
        self.body = body
        self.vsm_tuple = []
        self.freqs = {}

    def get_tokens(self):
        return word_tokenize(str(pre_process(self.body)))


# ----
def dump_dataset():
    """Store the original Reuters-21578 dataset as a list of python
    Document() objects.
    """
    tot_docs = []
    for i in range(0, 22):
        print(f"Processing file: {i}")
        filename = f"reut2-00{i}.sgm" if i < 10 else f"reut2-0{i}.sgm"
        tot_docs += read_docs(ORIG_REUTERS_DATASET_PATH + filename)
    create_dictionary(tot_docs)


def load_dataset():
    return myutil.load_pickle(REUTERS_DATASET_PATH)


# ----
def read_docs(file_path):
    final_docs = []
    with open(file_path, "r") as infile:
        text = infile.read()
        soup = BeautifulSoup(text, 'lxml')
        docs = soup.find_all("reuters")
        for doc in docs:
            # -- get all topics
            topic_sets = ["topics", "places", "people", "orgs",
                          "exchanges", "companies"]
            doc_topics = []
            for topic_set in topic_sets:
                doc_topics += [topic.decode_contents().strip()
                               for topic in doc.find(topic_set).find_all("d")]
            if doc_topics == []:
                # -- skip docs with no topics
                continue
            try:
                doc_body = ""
                doc_date = ""
                doc_title = ""
                if doc.find("dateline"):
                    # -- with lxml cannot find body tag directly (?)
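                    # -- (presumably the lxml HTML parser treats <BODY> as the
                    # -- HTML body element and rewrites it away, so the article
                    # -- text is reached by stepping past <DATELINE> instead)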
                    doc_body = str(doc.find("dateline").next.next)
                    doc_date = doc.find("dateline").decode_contents().strip()
                    if doc.find("title"):
                        doc_title = doc.find("title").decode_contents().strip()
                elif doc.find("text"):
                    doc_body = doc.find("text").decode_contents().strip()
            except Exception as e:
                print(doc)
                traceback.print_exc()
                print(e)
                # -- skip documents that fail to parse
                continue
            final_docs.append(Document(doc_topics, doc_title, doc_date, doc_body))
    print(f"Total n. of docs: {len(final_docs)}")
    return final_docs


# -------------------
# Dictionary stuff
# -----
def create_dictionary(docs):
    """
    Creates a dictionary out of a bunch of documents. For each word the
    dictionary maintains the number of distinct docs in which that word
    appears.
    """
    dictionary = {}
    for i, doc in enumerate(docs):
        print(f"Processing doc: {i}")
        seen_in_doc = {}
        pre_processed_text = pre_process(doc.body)
        freqs = {}
        for token in word_tokenize(str(pre_processed_text)):
            # -- update freqs for current doc
            freqs[token] = (freqs[token] + 1) if token in freqs else 1
            # -- update dictionary
            if token not in dictionary:
                dictionary[token] = 1
            elif token not in seen_in_doc:
                # -- increment count only for first occurrence in
                # -- current doc
                dictionary[token] += 1
            seen_in_doc[token] = True
        # -- save freqs for vsm creation
        doc.freqs = freqs
    # -- write to disk
    myutil.store_pickle(dictionary, REUTERS_DICTIONARY_PATH)
    myutil.store_pickle(docs, REUTERS_DATASET_PATH)


def load_dictionary():
    return myutil.load_pickle(REUTERS_DICTIONARY_PATH)


def feature_selection(dictionary):
    # TODO: actually implement feature selection instead of this
    # placeholder that just keeps the first N words
    N = 100
    small_dict = {}
    for i, word in enumerate(dictionary):
        if i >= N:
            break
        small_dict[word] = dictionary[word]
    return small_dict


# -------------------
# Preprocessing chain
# -----
def convert_lower_case(data):
    return np.char.lower(data)


def convert_numbers(data):
    num2word = {
        "0": " zero ",
        "1": " one ",
        "2": " two ",
        "3": " three ",
        "4": " four ",
        "5": " five ",
        "6": " six ",
        "7": " seven ",
        "8": " eight ",
        "9": " nine ",
    }
    for i in range(0, 10):
        data = np.char.replace(data, str(i), num2word[str(i)])
    return data


def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data


def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words:
            new_text = new_text + " " + w
    return np.char.strip(new_text)


def remove_apostrophe(data):
    return np.char.replace(data, "'", "")


def remove_single_characters(data):
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + " " + w
    return np.char.strip(new_text)


def stemming(data):
    stemmer = PorterStemmer()
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return np.char.strip(new_text)


def pre_process(data):
    data = convert_lower_case(data)
    data = convert_numbers(data)
    data = remove_punctuation(data)
    data = remove_stop_words(data)
    data = remove_apostrophe(data)
    data = remove_single_characters(data)
    data = stemming(data)
    return data


# -------------------
# Creation of VSM
# -----
def idf(word, tot_docs, dictionary):
    """
    Computes inverse-document-frequency (idf).
    """
    return math.log(tot_docs / dictionary[word])


def tf(word, doc):
    """
    Computes term-frequency (tf).
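
    Uses the log-scaled variant: tf = 1 + ln(count) when the word occurs
    in the doc, and 0 otherwise (math.log is the natural log). E.g. a raw
    count of 1 gives tf = 1.0 and a count of 7 gives tf ~= 2.95.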
""" return 1 + math.log(doc.freqs[word]) if word in doc.freqs else 0 def create_vsm(docs, dictionary): """ This function updates the contents of each docs to contain a tuple which is the representation of the doc in the vector space model we're building. """ words = dictionary.keys() tot_docs = len(docs) for i, doc in enumerate(docs): print(f"VSM: Processing doc: {i}") doc_t = [] for word in words: score = tf(word, doc) * idf(word, tot_docs, dictionary) doc_t.append(score) doc.vsm_tuple = doc_t # -- write to disk myutil.store_pickle(docs, REUTERS_DATASET_PATH) # ------------------- if __name__ == "__main__": dump_dataset() # print("hello") # docs = load_dataset() # basic_dic = load_dictionary() # cool_dic = feature_selection(basic_dic) # create_vsm(docs, cool_dic)