Skip to content

Commit

Permalink
Improved filtering for movies and shows
Browse files Browse the repository at this point in the history
  • Loading branch information
kaaloo committed Apr 13, 2024
1 parent 202e990 commit 401eb3c
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 7 deletions.
15 changes: 9 additions & 6 deletions site-observable/docs/data/films.sqlite.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import sqlite3
import tempfile
from datetime import datetime, timedelta
from datetime import datetime

import pandas as pd

Expand All @@ -18,12 +18,15 @@
# Remove adult movies
df = df[df["adult"] == False] # noqa: E712

# Calculate the date for the past two years
years_ago = datetime.now() - timedelta(days=365 * 2)
start_date = years_ago.replace(month=1, day=1)
# Remove documentaries
df = df[df["genres"].str.contains("Documentary") == False] # noqa: E712

# Filter the dataframe based on the start date
df = df[df["release_date"] >= start_date]
# Remove movies with a future release date or no release date
now = datetime.now()
df = df[df["release_date"] < now]

# Remove movies with no known revenue
df = df[df["revenue"] > 0]

# Add a column with the production_year based on the release_date
df["production_year"] = df["release_date"].dt.year
Expand Down
10 changes: 9 additions & 1 deletion site-observable/docs/data/shows.sqlite.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import sqlite3
import tempfile
from datetime import datetime

import pandas as pd

Expand All @@ -12,11 +13,18 @@
)
os.system("unzip full-tmdb-tv-shows-dataset-2023-150k-shows.zip >&2")

df = pd.read_csv("TMDB_tv_dataset_v3.csv")
df = pd.read_csv("TMDB_tv_dataset_v3.csv", parse_dates=["first_air_date"])

# Remove adult shows
df = df[df["adult"] == False] # noqa: E712

# Remove documentaries
df = df[df["genres"].str.contains("Documentary") == False] # noqa: E712

# Remove shows with a future first air date or no first air date
now = datetime.now()
df = df[df["first_air_date"] < now]

# Select the columns we want
df = df[["id", "name", "original_name", "poster_path"]]

Expand Down

0 comments on commit 401eb3c

Please sign in to comment.