Merge pull request #19 from dataforgoodfr/tmdb-filtering

Improve filtering of TMDB data
dataforgoodfr · Apr 13, 2024 · 3212958 · 3212958
2 parents 95fc990 + 401eb3c
commit 3212958
Show file tree

Hide file tree

Showing 6 changed files with 53 additions and 48 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,4 @@
-Observatoire des imaginaires
-================
-
+# Observatoire des imaginaires
 
 ## Installing with poetry
 
@@ -65,7 +63,7 @@ pip install poetry
    jupyter notebook
    ```
 
-## Download datasets from kaggle 
+## Download datasets from kaggle
 
 If you want to use kaggle to download datasets, please make sure your API credentials are stored in ~/.kaggle/kaggle.json.
 
@@ -78,24 +76,26 @@ make download-tmdb-movies-dataset
 make download-full-tmdb-tv-shows-dataset
 ```
 
-
 Alternatively, you can download the datasets directly from the kaggle website:
+
 - [tmdb-movies-dataset](https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies)
 - [full-tmdb-tv-shows-dataset](https://www.kaggle.com/datasets/asaniczka/full-tmdb-tv-shows-dataset-2023-150k-shows)
 
 ## Website to select a specific movie or TV show
 
 The [site-observable](https://github.com/dataforgoodfr/12_observatoire_des_imaginaires/tree/main/site-observable) directory contains
 an observable framework site that collects film and movie data from the above datasets on kaggle and filters the datasets according
-to the following rules in order to reduced the size of the data present on the generated web site.  This site provides a search UI
-allow a user to select a specific movie or TV show.  The user can then click on the link for their selection to kick off the
+to the following rules in order to reduce the size of the data present on the generated web site. This site provides a search UI
+that allows a user to select a specific movie or TV show. The user can then click on the link for their selection to kick off the
 questionnaire on tally and is destined to be embedded in an iframe in the main Observatoire des Imaginaires web site.
 
 Movies:
+
 - filter out adult movies
 - filter out movies released more than two years ago
 
 TV Shows:
+
 - filter out adult shows
 
 The web site is currently hosted on the [Observable hosting platform](https://observablehq.com/) and is available at the following URL:
@@ -106,10 +106,15 @@ https://observatoire-des-imaginaires.observablehq.cloud/questionnaire
 
 [Install precommits](https://pre-commit.com/)
 
-
-    pre-commit run --all-files 
-
+    pre-commit run --all-files
 
 ## Use Tox to test your code
 
     tox -vv
+
+## Tasks
+
+This repo includes invoke for pythonic task execution. To see the
+list of available tasks you can run:
+
+invoke -l
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,6 +23,7 @@ pre-commit = "^2.20.0"
 pytest = "^7.2.0"
 # ruff = "^0.0.254"
 tox = "^4.4.8"
+invoke = "^2.2.0"
 
 [tool.ruff]
 select = [

diff --git a/site-observable/docs/data/films.sqlite.py b/site-observable/docs/data/films.sqlite.py
@@ -1,7 +1,7 @@
 import os
 import sqlite3
 import tempfile
-from datetime import datetime, timedelta
+from datetime import datetime
 
 import pandas as pd
 
@@ -18,12 +18,15 @@
     # Remove adult movies
     df = df[df["adult"] == False]  # noqa: E712
 
-    # Calculate the date for the past two years
-    years_ago = datetime.now() - timedelta(days=365 * 2)
-    start_date = years_ago.replace(month=1, day=1)
+    # Remove documentaries
+    df = df[df["genres"].str.contains("Documentary") == False]  # noqa: E712
 
-    # Filter the dataframe based on the start date
-    df = df[df["release_date"] >= start_date]
+    # Remove movies with a future release date
+    now = datetime.now()
+    df = df[df["release_date"] < now]
+
+    # Remove movies with no known revenue
+    df = df[df["revenue"] > 0]
 
     # Add a column with the production_year based on the release_date
     df["production_year"] = df["release_date"].dt.year

diff --git a/site-observable/docs/data/shows.sqlite.py b/site-observable/docs/data/shows.sqlite.py
@@ -1,6 +1,7 @@
 import os
 import sqlite3
 import tempfile
+from datetime import datetime
 
 import pandas as pd
 
@@ -12,11 +13,18 @@
     )
     os.system("unzip full-tmdb-tv-shows-dataset-2023-150k-shows.zip >&2")
 
-    df = pd.read_csv("TMDB_tv_dataset_v3.csv")
+    df = pd.read_csv("TMDB_tv_dataset_v3.csv", parse_dates=["first_air_date"])
 
     # Remove adult movies
     df = df[df["adult"] == False]  # noqa: E712
 
+    # Remove documentaries
+    df = df[df["genres"].str.contains("Documentary") == False]  # noqa: E712
+
+    # Remove shows with a future first air date or no first air date
+    now = datetime.now()
+    df = df[df["first_air_date"] < now]
+
     # Select the columns we want
     df = df[["id", "name", "original_name", "poster_path"]]
 

diff --git a/tasks.py b/tasks.py
@@ -0,0 +1,6 @@
+from invoke import Context, task
+
+
+@task
+def clean_branches(c: Context) -> None:
+    c.run("git branch --merged | grep -v '\\*\\|main' | xargs -n 1 git branch -d")