#!/usr/bin/env python3
"""This script showcases a possible usage of the Python library BeautifulSoup4
by extracting the lecture schedule data from the official site for computer
science @ Tor Vergata.
"""
import os
import sys

import requests
from bs4 import BeautifulSoup

URL = "http://www.informatica.uniroma2.it/pages/trien/orario/orario.htm"
INPUT_FILE = "orario.html"
OUTPUT_FILE = "orario.csv"
TEXT = ""

COURSE_CODES = {
    "Analisi Matematica": "AM",
    "Architettura dei sistemi di elaborazione": "AE",
    "Fisica": "FI",
    "Geometria ed algebra": "GA",
    "Logica e reti logiche": "LRL",
    "Matematica discreta": "MD",
    "Programmazione dei calcolatori con laboratorio": "PR",
    "Algoritmi e strutture dati": "ASD",
    "Basi di dati e di conoscenza": "BDC",
    "Calcolo delle probabilità e statistica": "CP",
    "Fondamenti di informatica": "FO",
    "Linguaggi e metodologie di programmazione": "LMP",
    "Ricerca operativa": "RO",
    "Sistemi operativi e reti": "SOR",
    "Algoritmi e strutture dati 2": "ASD2",
    "Calcolo numerico": "CN",
    "Ingegneria del software": "IS",
    "Intelligenza artificiale 1": "IA",
    "Lingua inglese": "LING",
    "Modelli e linguaggi di simulazione": "MLS",
    "Programmazione Java per dispositivi mobili": "PJDM",
    "Programmazione Web": "PW",
}

# -------------------------------
# Part 1 - Get the data
# -------------------------------
if not os.path.exists(INPUT_FILE):
    # -- if we don't have the file, download it and save it
    print("About to download...")
    r = requests.get(URL)
    if r.status_code != 200:
        print("Could not download page!")
        sys.exit(1)

    # -- cache the raw HTML so later runs can work offline
    TEXT = r.text
    with open(INPUT_FILE, "w", encoding="utf-8") as f:
        f.write(TEXT)
else:
    print("Reading from file...")
    # -- otherwise simply read the HTML from the cached file
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        TEXT = f.read()

# ---------------------------------
# Part 2 - Parse and write the data
# ---------------------------------
soup = BeautifulSoup(TEXT, "html.parser")
title = soup.find("h1").decode_contents().strip()

# -- compute the current semester from the page title
semester = ""
if "primo" in title:
    semester = "1"
elif "secondo" in title:
    semester = "2"

table = soup.find("table")
rows = table.find_all("tr")

current_year = 0
skip = False
default_room = ""

with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
    # -- first row with the column names (CSV header)
    out.write("anno,ora,lunedì,martedì,mercoledì,giovedì,venerdì\n")
    for row in rows:
        if skip:
            skip = False
            continue
        if row.find("h2"):
            # -- an <h2> row marks the start of the next year's schedule
            current_year += 1
            # -- skip the next row (it only contains the day names)
            skip = True
        else:
            # -- get juicy data
            cols = row.find_all("td")
            hour = cols[0].decode_contents().strip()

            # -- schedule[i] := course of the i-th day of the week for the
            # -- current hour; "X" marks a free slot.
            schedule = []
            for col in cols[1:]:
                if col.find("a"):
                    course = col.a.decode_contents().strip()
                    # -- fall back to the full name if the course has no code
                    schedule.append(COURSE_CODES.get(course, course))
                else:
                    schedule.append("X")

            schedule_str = ",".join(schedule)
            out.write(f"{current_year},{hour},{schedule_str}\n")
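
# -- Optional sanity check (not part of the original extraction logic):
# -- a minimal sketch that reads the generated CSV back with the standard
# -- library csv module and prints the first few rows, so the output can be
# -- eyeballed after a run.
import csv

with open(OUTPUT_FILE, newline="", encoding="utf-8") as check:
    for i, parsed_row in enumerate(csv.reader(check)):
        if i >= 4:
            break
        print(parsed_row)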