Computing expiry
Once we have the subcategory of a document, computing the expiry is trivial.
Storing normal retention periods
First, store the regular retention periods of different subcategories into a dictionary:
retention = {}
for i in range(df.shape[0]):
retention[df.loc[i, "Ref"]] = str(df.loc[i, "Retention"]) + " " + str(df.loc[i, "Period"])
Preliminary computation
Next, make a dataframe with the expiry date calculated solely based on normal retention period:
def inner_join(dict1, dict2, dict3):
# Find common keys between dict1 and dict2
common_keys = set(dict1.keys()) & set(dict2.keys())
# Perform inner join and include values from dict3
result = {key: {'category': dict1[key], 'date': dict2[key], 'retention period': dict3[dict1[key]]} for key in common_keys if dict1[key] in dict3}
return result
result = inner_join(text_classification, dates, retention)
result.to_csv("result.csv", index=False)
Country Deviation
Finally, correct the expiry for country deviations
from dateutil.relativedelta import relativedelta
countries = {
"BR": "Brazil",
"CA": "Canada",
"CH": "Switzerland",
"CN": "China",
"DE": "Germany",
"FR": "France",
"GB": "United Kingdom",
"IN": "India",
"India": "India",
"US": "US"
}
for file in result.keys():
initials = file.split(" ")[0]
if initials in countries.keys():
result[file]["country"] = countries[initials]
else:
result[file]["country"] = "Global"
df = pd.read_csv("result.csv")
country = pd.read_csv("processed_Simplified.csv")[["Ref", "Country Deviation"]]
df["date"] = pd.to_datetime(df["date"])
merged_df = pd.merge(df, country, left_on='category', right_on='Ref', how='left')
def extend_date_by_deviation(row):
if pd.notnull(row['Country Deviation']):
deviations = row['Country Deviation'].split(', ')
for deviation in deviations:
country, period = deviation.split(': ')
if row['country'] == country:
if 'years' in period:
years = int(period.split(' ')[0])
if years:
return row['date'] + relativedelta(years=years)
elif 'months' in period:
months = int(period.split(' ')[0])
if months:
return row['date'] + relativedelta(months=months)
elif 'days' in period:
days = int(period.split(' ')[0])
if days:
return row['date'] + relativedelta(days=days)
elif "permanent" in period.lower() or "indefinite" in period.lower():
return "Permanent"
period = row['retention period']
if 'years' in period:
years = int(period.split(' ')[0])
if years:
return row['date'] + relativedelta(years=years)
elif 'months' in period:
months = int(period.split(' ')[0])
if months:
return row['date'] + relativedelta(months=months)
elif 'days' in period:
days = int(period.split(' ')[0])
if days:
return row['date'] + relativedelta(days=days)
elif "permanent" in period.lower() or "indefinite" in period.lower():
return "Permanent"
else:
return row['date']
merged_df['expiry date'] = merged_df.apply(extend_date_by_deviation, axis=1)
merged_df['expiry date'] = pd.to_datetime(merged_df['expiry date'], errors="coerce")
merged_df['expiry date'] = merged_df['expiry date'].dt.date
merged_df[["date", "expiry date"]] = merged_df[["date", "expiry date"]].astype(str)
merged_df = merged_df.drop("Ref", axis=1)[["file", "category", "date", "retention period", "country", "Country Deviation", "expiry date"]]