Computing expiry

Once we have the subcategory of a document, computing the expiry is trivial.

Storing normal retention periods

First, store the regular retention periods of different subcategories into a dictionary:

retention = {}
for i in range(df.shape[0]):
    retention[df.loc[i, "Ref"]] = str(df.loc[i, "Retention"]) + " " + str(df.loc[i, "Period"])

Preliminary computation

Next, make a dataframe with the expiry date calculated solely based on normal retention period:

def inner_join(dict1, dict2, dict3):
    # Find common keys between dict1 and dict2
    common_keys = set(dict1.keys()) & set(dict2.keys())

    # Perform inner join and include values from dict3
    result = {key: {'category': dict1[key], 'date': dict2[key], 'retention period': dict3[dict1[key]]} for key in common_keys if dict1[key] in dict3}

    return result

result = inner_join(text_classification, dates, retention)
result.to_csv("result.csv", index=False)

Country Deviation

Finally, correct the expiry for country deviations

from dateutil.relativedelta import relativedelta

countries = {
    "BR": "Brazil",
    "CA": "Canada",
    "CH": "Switzerland",
    "CN": "China",
    "DE": "Germany",
    "FR": "France",
    "GB": "United Kingdom",
    "IN": "India",
    "India": "India",
    "US": "US"
}

for file in result.keys():
    initials = file.split(" ")[0]
    if initials in countries.keys():
        result[file]["country"] = countries[initials]
    else:
        result[file]["country"] = "Global"

df = pd.read_csv("result.csv")
country = pd.read_csv("processed_Simplified.csv")[["Ref", "Country Deviation"]]
df["date"] = pd.to_datetime(df["date"])
merged_df = pd.merge(df, country, left_on='category', right_on='Ref', how='left')

def extend_date_by_deviation(row):
    if pd.notnull(row['Country Deviation']):
        deviations = row['Country Deviation'].split(', ')
        for deviation in deviations:
            country, period = deviation.split(': ')
            if row['country'] == country:
                if 'years' in period:
                    years = int(period.split(' ')[0])
                    if years:
                        return row['date'] + relativedelta(years=years)
                elif 'months' in period:
                    months = int(period.split(' ')[0])
                    if months:
                        return row['date'] + relativedelta(months=months)
                elif 'days' in period:
                    days = int(period.split(' ')[0])
                    if days:
                        return row['date'] + relativedelta(days=days)
                elif "permanent" in period.lower() or "indefinite" in period.lower():
                    return "Permanent"
    period = row['retention period']
    if 'years' in period:
        years = int(period.split(' ')[0])
        if years:
            return row['date'] + relativedelta(years=years)
    elif 'months' in period:
        months = int(period.split(' ')[0])
        if months:
            return row['date'] + relativedelta(months=months)
    elif 'days' in period:
        days = int(period.split(' ')[0])
        if days:
            return row['date'] + relativedelta(days=days)
    elif "permanent" in period.lower() or "indefinite" in period.lower():
        return "Permanent"
    else:
        return row['date']

merged_df['expiry date'] = merged_df.apply(extend_date_by_deviation, axis=1)
merged_df['expiry date'] = pd.to_datetime(merged_df['expiry date'], errors="coerce")
merged_df['expiry date'] = merged_df['expiry date'].dt.date
merged_df[["date", "expiry date"]] = merged_df[["date", "expiry date"]].astype(str)
merged_df = merged_df.drop("Ref", axis=1)[["file", "category", "date", "retention period", "country", "Country Deviation", "expiry date"]]

Computing expiry

Storing normal retention periods​

Preliminary computation​

Country Deviation​

Storing normal retention periods

Preliminary computation

Country Deviation