From 401eb3c64c95262b0d065dad055144d34b22458c Mon Sep 17 00:00:00 2001
From: Luis Arias
Date: Sat, 13 Apr 2024 11:00:39 +0200
Subject: [PATCH] Improved filtering for movies and shows

---
Review notes (placed after the "---" separator, so they are not part of the
commit message):

* This copy of the patch had its line breaks and leading whitespace
  collapsed. The line structure below was rebuilt from the hunk headers and
  the diffstat, which both add up. The 4-space indentation of the hunk
  bodies and the two spaces before "# noqa" are reconstructed -- TODO
  confirm against the target files before applying.
* `df["genres"].str.contains("Documentary") == False` keeps only rows where
  the match result is exactly False. With pandas' default `na` handling a
  missing `genres` value does not produce False, so such rows are dropped
  together with the documentaries. NOTE(review): presumably unintended,
  since the comment only mentions documentaries -- confirm. If those rows
  should stay, `~df["genres"].str.contains("Documentary", na=False)` keeps
  them and needs no `noqa`.
* The `< now` comparisons also drop rows whose date is missing (NaT compares
  as False). The shows comment says so; the films comment does not.
* `now = datetime.now()` is a naive local timestamp; this assumes the parsed
  date columns are timezone-naive as well -- TODO confirm.
* shows.sqlite.py: the unchanged context comment "# Remove adult movies"
  presumably means "shows"; left untouched here because it is a context
  line and must match the target file.

 site-observable/docs/data/films.sqlite.py | 15 +++++++++------
 site-observable/docs/data/shows.sqlite.py | 10 +++++++++-
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/site-observable/docs/data/films.sqlite.py b/site-observable/docs/data/films.sqlite.py
index c770651..0b747e2 100755
--- a/site-observable/docs/data/films.sqlite.py
+++ b/site-observable/docs/data/films.sqlite.py
@@ -1,7 +1,7 @@
 import os
 import sqlite3
 import tempfile
-from datetime import datetime, timedelta
+from datetime import datetime
 
 import pandas as pd
 
@@ -18,12 +18,15 @@
     # Remove adult movies
     df = df[df["adult"] == False]  # noqa: E712
 
-    # Calculate the date for the past two years
-    years_ago = datetime.now() - timedelta(days=365 * 2)
-    start_date = years_ago.replace(month=1, day=1)
+    # Remove documentaries
+    df = df[df["genres"].str.contains("Documentary") == False]  # noqa: E712
 
-    # Filter the dataframe based on the start date
-    df = df[df["release_date"] >= start_date]
+    # Remove movies with a future release date
+    now = datetime.now()
+    df = df[df["release_date"] < now]
+
+    # Remove movies with no known revenue
+    df = df[df["revenue"] > 0]
 
     # Add a column with the production_year based on the release_date
     df["production_year"] = df["release_date"].dt.year
diff --git a/site-observable/docs/data/shows.sqlite.py b/site-observable/docs/data/shows.sqlite.py
index e8211d7..2cfde7a 100755
--- a/site-observable/docs/data/shows.sqlite.py
+++ b/site-observable/docs/data/shows.sqlite.py
@@ -1,6 +1,7 @@
 import os
 import sqlite3
 import tempfile
+from datetime import datetime
 
 import pandas as pd
 
@@ -12,11 +13,18 @@
     )
     os.system("unzip full-tmdb-tv-shows-dataset-2023-150k-shows.zip >&2")
 
-    df = pd.read_csv("TMDB_tv_dataset_v3.csv")
+    df = pd.read_csv("TMDB_tv_dataset_v3.csv", parse_dates=["first_air_date"])
 
     # Remove adult movies
     df = df[df["adult"] == False]  # noqa: E712
 
+    # Remove documentaries
+    df = df[df["genres"].str.contains("Documentary") == False]  # noqa: E712
+
+    # Remove shows with a future first air date or no first air date
+    now = datetime.now()
+    df = df[df["first_air_date"] < now]
+
     # Select the columns we want
     df = df[["id", "name", "original_name", "poster_path"]]
 