In [1]:
import seaborn as sns
import requests
import zipfile
import io
import polars as pl
from matplotlib import pyplot as plt
from datetime import datetime, timedelta
from funding_crawler.viz import count_plot, perc_comp


def count_plot(df, col):
    lst = [x for xs in df[col] if xs is not None for x in xs]

    series = pl.Series(col, lst)
    counts_df = (
        pl.DataFrame([series])
        .unpivot()
        .group_by("value")
        .len()
        .sort("len", descending=True)
        .rename({"value": col, "len": "count"})
    )

    sns.barplot(data=counts_df, y=col, x="count", order=counts_df[col])
    plt.show()
In [2]:
data_url = "https://foerderdatenbankdump.fra1.cdn.digitaloceanspaces.com/data/parquet_data.zip"
In [3]:
response = requests.get(data_url)
response.raise_for_status()

with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    file_name = z.namelist()[0]
    with z.open(file_name) as f:
        df = pl.read_parquet(f)

print(f"{len(df)} Förderprogramme")
print(df.columns)
print(df.dtypes)
2456 Förderprogramme
['id_hash', 'id_url', 'url', 'title', 'description', 'more_info', 'legal_basis', 'contact_info_institution', 'contact_info_street', 'contact_info_city', 'contact_info_fax', 'contact_info_phone', 'contact_info_email', 'contact_info_website', 'funding_type', 'funding_area', 'funding_location', 'eligible_applicants', 'funding_body', 'further_links', 'checksum', 'license_info', 'previous_update_dates', 'last_updated', 'on_website_from', 'deleted']
[String, String, String, String, String, String, String, String, String, String, String, String, String, String, List(String), List(String), List(String), List(String), String, List(String), String, String, List(Datetime(time_unit='us', time_zone=None)), Datetime(time_unit='us', time_zone=None), Datetime(time_unit='us', time_zone=None), Boolean]
In [4]:
current_date = datetime.today()

# Determine the most recent update date
if not df["last_updated"].is_null().all():
    most_recent_updated = df["last_updated"].max()
else:
    most_recent_updated = df["on_website_from"].max()

print("Today is:", current_date)
print("Most recent update (updates when crawler ran):", most_recent_updated)
Today is: 2025-06-22 00:38:55.735056
Most recent update (updates when crawler ran): 2025-06-21 02:01:10.878277
In [5]:
current_df = df.filter(pl.col("deleted") == False)
print(f"{len(current_df)} aktive Förderprogramme")
2455 aktive Förderprogramme

Descriptive Statistics¶

Funding Area¶

Multiple areas are possible per program

In [6]:
count_plot(df, "funding_area")
No description has been provided for this image

Funding Location¶

Multiple applicant locations are possible per program.

In [7]:
count_plot(df, "funding_location")
No description has been provided for this image

Eligible Applicant Types¶

Multiple applicant types are possible per program

In [8]:
count_plot(df, "eligible_applicants")
No description has been provided for this image

Funding Types¶

Multiple funding types are possible

In [9]:
count_plot(df, "funding_type")
No description has been provided for this image

Metadata¶

Tracking added and deleted programs¶

In [10]:
if len(df.filter(df["last_updated"].is_null())) != len(df):
    df = df.with_columns(
        pl.col("on_website_from").alias("creation_date")
    )

    df = df.with_columns(
        [pl.col("creation_date").dt.truncate("1w").alias("week_start_date")]
    )

    earliest_date = df.select(pl.col("creation_date").min()).item()
    df_filtered = df.filter(pl.col("creation_date") != earliest_date)

    weekly_added = (
        df_filtered.filter(~pl.col("deleted"))
        .group_by("week_start_date")
        .len()
        .rename({"len": "added_count"})
        .sort("week_start_date")
    )

    weekly_deleted = (
        df_filtered.filter(pl.col("deleted"))
        .group_by("week_start_date")
        .len()
        .rename({"len": "deleted_count"})
        .sort("week_start_date")
    )

    weekly_counts = weekly_added.join(
        weekly_deleted, on="week_start_date", how="full"
    ).fill_null(0)

    plt.figure(figsize=(12, 6))

    sns.lineplot(
        data=weekly_counts,
        x="week_start_date",
        y="added_count",
        marker="o",
        label="Added Programs",
    )
    sns.lineplot(
        data=weekly_counts,
        x="week_start_date",
        y="deleted_count",
        marker="o",
        label="Deleted Programs",
        color="red",
    )

    plt.title("Programs Added and Deleted Per Week")
    plt.xlabel("Date")
    plt.ylabel("Number of Programs")
    plt.legend()

    plt.gcf().autofmt_xdate()

    plt.tight_layout()
    plt.show()
else:
    print("No update history available yet")
No description has been provided for this image

Type of programs being deleted and updated¶

In [11]:
if len(df.filter(df["last_updated"].is_null())) != len(df):
    current_date = datetime.now()
    one_month_ago = current_date - timedelta(days=30)
    recent_del = df.filter(pl.col("creation_date") > one_month_ago).filter(
        pl.col("deleted")
    )

    perc_comp(
        recent_del,
        df,
        "funding_area",
        f"Comparison of percentage for funding areas deleted in last month ({len(recent_del)}) vs. percentage of total previous data",
    )
else:
    print("No update history available yet")
No description has been provided for this image
In [12]:
if len(df.filter(df["last_updated"].is_null())) != len(df):
    recent_add = df.filter(pl.col("creation_date") > one_month_ago).filter(
        ~pl.col("deleted") 
    )

    perc_comp(
        recent_add,
        df,
        "funding_area",
        f"Comparison of percentage of funding areas added in the last month ({len(recent_add)}) vs. percentage of total previous data",
    )
else:
    print("No update history available yet")
No description has been provided for this image