Preprocessing

The first step is to extract all the text from a document and preprocess it into a format suitable for analysis.

Tech stack

The packages used for this level are:

  • pandas: To read and manipulate dataframes

  • fitz (PyMuPDF): To extract text from PDF documents

  • nltk: To remove stopwords and extract important words

  • spacy: To extract dates from the documents
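
All of these are available from PyPI; note that fitz is distributed as part of the PyMuPDF package, and the date extraction step later assumes spaCy's small English model is available:

pip install pandas pymupdf nltk spacy
python -m spacy download en_core_web_sm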

Processing retention schedule

The retention schedule is given as an Excel workbook with lots of empty/redundant rows and columns. It is first processed into a clean CSV file.

import pandas as pd

# file_path points to the retention schedule workbook (an .xlsx file)
xls = pd.read_excel(file_path, sheet_name=None)

# Export every sheet of the workbook as a separate CSV file
csv_files = {sheet_name: sheet.to_csv(index=False) for sheet_name, sheet in xls.items()}
for sheet_name, csv_data in csv_files.items():
    with open(f"{sheet_name}.csv", "w") as csv_file:
        csv_file.write(csv_data)

def preprocess_csv(file_path):
    # Drop the leading junk rows and any fully empty rows/columns
    df = pd.read_csv(file_path)
    df = df.drop([0, 1, 2]).dropna(how='all', axis=1).dropna(how='all', axis=0).reset_index(drop=True)

    # Promote the first remaining row to be the header
    new_header = df.iloc[0]
    df = df.drop(0)
    df.columns = new_header

    # Keep only rows that have a value in the 'Ref' column
    df = df.dropna(subset=['Ref'], how='any').reset_index(drop=True)
    df.to_csv('processed_Simplified.csv', index=False, header=True)
    return None

preprocess_csv("Simplified.csv")
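
To sanity-check the result, the cleaned file can be loaded back and its header inspected:

processed = pd.read_csv('processed_Simplified.csv')
print(processed.columns.tolist())
print(processed.head())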

Extract text

Next, each document is read and its text contents are stored in a dictionary.

import fitz  # PyMuPDF
import os

def extract_text_from_pdf(pdf_file, folder_path):
    # Concatenate the plain text of every page in the PDF
    text = ""
    with fitz.open(os.path.join(folder_path, pdf_file)) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            text += page.get_text("text")
    return text

folder_path = './repository'

# Collect every PDF in the repository, keeping paths relative to folder_path
pdf_files = []
for root, dirs, files in os.walk(folder_path):
    for file in files:
        if file.lower().endswith(".pdf"):
            pdf_files.append(os.path.relpath(os.path.join(root, file), folder_path))

# Map each file to its extracted text
file_contents = {}
for pdf_file in pdf_files:
    file_contents[pdf_file] = {"file_name": pdf_file, "text": extract_text_from_pdf(pdf_file, folder_path)}
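
As a quick check, you can confirm how many documents were read and peek at the start of one of them (purely illustrative):

print(f"Extracted text from {len(file_contents)} documents")
first_file = next(iter(file_contents))
print(file_contents[first_file]["text"][:200])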

Process text

Now the text content of each document is processed into a format suitable for analysis.

Remove hyphens and replace ligatures

First, end-of-line hyphenation is removed and ligatures ("ﬆ", "Æ", etc.) are replaced with their standard character equivalents.

from typing import List

def replace_ligatures(text: str) -> str:
    # Map Unicode ligature characters to their plain-character equivalents
    ligatures = {
        "ﬀ": "ff",
        "ﬁ": "fi",
        "ﬂ": "fl",
        "ﬃ": "ffi",
        "ﬄ": "ffl",
        "ﬅ": "ft",
        "ﬆ": "st",
        "Ꜳ": "AA",
        "Æ": "AE",
        "ꜳ": "aa",
    }
    for search, replace in ligatures.items():
        text = text.replace(search, replace)
    return text

def remove_hyphens(text: str) -> str:
    """
    Joins words that were hyphenated across line breaks.

    This fails for:
    * Natural dashes: well-known, self-replication, use-cases, non-semantic,
      Post-processing, Window-wise, viewpoint-dependent
    * Trailing math operands: 2 - 4
    * Names: Lopez-Ferreras, VGG-19, CIFAR-100
    """
    lines = [line.rstrip() for line in text.split("\n")]

    # Find the lines that end with a hyphen
    line_numbers = []
    for line_no, line in enumerate(lines[:-1]):
        if line.endswith("-"):
            line_numbers.append(line_no)

    # Merge each hyphenated word with its continuation on the next line
    for line_no in line_numbers:
        lines = dehyphenate(lines, line_no)
    return "\n".join(lines)


def dehyphenate(lines: List[str], line_no: int) -> List[str]:
    next_line = lines[line_no + 1]
    word_suffix = next_line.split(" ")[0]
    lines[line_no] = lines[line_no][:-1] + word_suffix
    lines[line_no + 1] = lines[line_no + 1][len(word_suffix):]
    return lines

for file in file_contents.keys():
    file_contents[file]["text"] = remove_hyphens(replace_ligatures(file_contents[file]["text"]))
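
As a small illustration, here is what these two steps do to a made-up snippet of extracted PDF text:

sample = "The ﬁrst conﬁgu-\nration was mis-\nclassiﬁed."
print(remove_hyphens(replace_ligatures(sample)))
# The first configuration
#  was misclassified.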

Remove stopwords

Stopwords are words like articles (a, an, the), prepositions (of, in, at), etc. which add little semantic meaning to the text. Removing them and retaining only the meaningful words results in better analysis.

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def extract_important_words(text):
    # Tokenize, drop stopwords, then keep only nouns, verbs and adjectives
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    tagged_words = pos_tag(filtered_words)
    important_words = [word for word, tag in tagged_words if tag in ('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS')]
    return ' '.join(important_words)

for file in file_contents.keys():
    file_contents[file]["imp_words"] = extract_important_words(file_contents[file]["text"])
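
For example (output shown approximately, since the tagger's exact choices can vary):

print(extract_important_words("The policy was reviewed by the finance team in March 2021."))
# policy reviewed finance team March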

Extract date

Next, the creation date of each document is extracted from its text contents.

import spacy
from dateutil import parser

nlp = spacy.load("en_core_web_sm")

def extract_all_dates(text):
    # Use spaCy's named entity recognizer to find DATE entities
    doc = nlp(text)
    dates = [ent.text for ent in doc.ents if ent.label_ == 'DATE']
    return dates

def convert(dates):
    formatted_dates = []
    for date in dates:
        try:
            parsed_date = parser.parse(date, fuzzy=True)
            formatted_date = parsed_date.strftime('%Y-%m-%d')

            # Retain only the dates which are not impossible
            if formatted_date >= "2010-01-01" and formatted_date <= "2024-04-10":
                formatted_dates.append(formatted_date)
        except (ValueError, OverflowError):
            # If the date cannot be parsed, move on to the next one since it is invalid
            continue
    return formatted_dates

dates = {}
for file in file_contents.keys():
    text = file_contents[file]["text"]
    all_dates = convert(extract_all_dates(text))

    # Take the first valid date found in the document as its creation date
    if all_dates:
        dates[file] = all_dates[0]
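
As a rough illustration of the two helpers (spaCy's exact entity spans can vary by model version), a sentence with one plausible and one out-of-range date would be handled like this:

print(convert(extract_all_dates("This policy was approved on 12 March 2021 and replaces the 1998 version.")))
# ['2021-03-12']  (the 1998 mention is dropped because it falls outside the allowed range)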