From 202e990c26e69d18e7a4961752e47d6a1871189b Mon Sep 17 00:00:00 2001
From: Luis Arias
Date: Sat, 13 Apr 2024 10:33:09 +0200
Subject: [PATCH 1/2] Add invoke and implement clean-branches task

---
 README.md      | 25 +++++++++++++++----------
 poetry.lock    | 44 +++++++++++++-------------------------------
 pyproject.toml |  1 +
 tasks.py       |  6 ++++++
 4 files changed, 35 insertions(+), 41 deletions(-)
 create mode 100644 tasks.py

diff --git a/README.md b/README.md
index 36fc773..902d355 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,4 @@
-Observatoire des imaginaires
-================
-
+# Observatoire des imaginaires
 
 ## Installing with poetry
 
@@ -65,7 +63,7 @@ pip install poetry
 ```
 jupyter notebook
 ```
 
-## Download datasets from kaggle 
+## Download datasets from kaggle
 
 If you want to use Kaggle to download datasets, please make sure to have your API credentials in ~/.kaggle/kaggle.json.
 
@@ -78,8 +76,8 @@ make download-tmdb-movies-dataset
 make download-full-tmdb-tv-shows-dataset
 ```
 
-
 Alternatively, you can download the datasets directly from the Kaggle website:
+
 - [tmdb-movies-dataset](https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies)
 - [full-tmdb-tv-shows-dataset](https://www.kaggle.com/datasets/asaniczka/full-tmdb-tv-shows-dataset-2023-150k-shows)
 
@@ -87,15 +85,17 @@
 The [site-observable](https://github.com/dataforgoodfr/12_observatoire_des_imaginaires/tree/main/site-observable) directory contains an Observable Framework site that collects film and TV show data from the above Kaggle datasets and filters them according
-to the following rules in order to reduced the size of the data present on the generated web site. This site provides a search UI
-allow a user to select a specific movie or TV show. The user can then click on the link for their selection to kick off the
+to the following rules in order to reduce the size of the data present on the generated web site. This site provides a search UI
+allowing a user to select a specific movie or TV show. The user can then click on the link for their selection to kick off the
 questionnaire on Tally and is destined to be embedded in an iframe in the main Observatoire des Imaginaires web site.
 
 Movies:
+
 - filter out adult movies
 - filter out movies released more than two years ago
 
 TV Shows:
+
 - filter out adult shows
 
 The web site is currently hosted on the [Observable hosting platform](https://observablehq.com/) and is available at the following URL:
 
 https://observatoire-des-imaginaires.observablehq.cloud/questionnaire
@@ -106,10 +106,15 @@
 [Install precommits](https://pre-commit.com/)
 
-    - pre-commit run --all-files
-
+    pre-commit run --all-files
 
 ## Use Tox to test your code
 
     tox -vv
+
+## Tasks
+
+This repo uses invoke for Pythonic task execution. To see the
+list of available tasks, run:
+
+    invoke -l
diff --git a/poetry.lock b/poetry.lock
index fd4a6b3..7cf0f4b 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]] name = "anyio" @@ -642,6 +642,17 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "invoke" +version = "2.2.0" +description = "Pythonic task execution" +optional = false +python-versions = ">=3.6" +files = [ + {file = "invoke-2.2.0-py3-none-any.whl", hash = "sha256:6ea924cc53d4f78e3d98bc436b08069a03077e6f85ad1ddaa8a116d7dad15820"}, + {file = "invoke-2.2.0.tar.gz", hash = "sha256:ee6cbb101af1a859c7fe84f2a264c059020b0cb7fe3535f9424300ab568f6bd5"}, +] + [[package]] name = "ipykernel" version = "5.5.6" @@ -1788,13 +1799,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -2273,28 +2277,6 @@ webencodings = ">=0.4" doc = ["sphinx", "sphinx_rtd_theme"] test = ["flake8", "isort", "pytest"] -[[package]] -name = "six" -version = "1.16.0" -description = "Python 2 and 3 compatibility utilities" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, - {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, -] - -[[package]] -name = "text-unidecode" -version = "1.3" -description = "The most basic Text::Unidecode port" -optional = false -python-versions = "*" -files = [ - {file = "text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"}, - {file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = 
"sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"}, -] - [[package]] name = "tomli" version = "2.0.1" @@ -2528,4 +2510,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "eeb9ba8f43237f51301a5ec4c62d79a177ae19b81bbf920c0947d53ccca6ec00" +content-hash = "4f39c5e267d633fab26b42f0bffa6f1928ae0d8b3c3230112411a703c4762260" diff --git a/pyproject.toml b/pyproject.toml index 57841a9..dd1755c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ pre-commit = "^2.20.0" pytest = "^7.2.0" # ruff = "^0.0.254" tox = "^4.4.8" +invoke = "^2.2.0" [tool.ruff] select = [ diff --git a/tasks.py b/tasks.py new file mode 100644 index 0000000..f44ff1e --- /dev/null +++ b/tasks.py @@ -0,0 +1,6 @@ +from invoke import Context, task + + +@task +def clean_branches(c: Context) -> None: + c.run("git branch --merged | grep -v '\\*\\|main' | xargs -n 1 git branch -d") From 401eb3c64c95262b0d065dad055144d34b22458c Mon Sep 17 00:00:00 2001 From: Luis Arias Date: Sat, 13 Apr 2024 11:00:39 +0200 Subject: [PATCH 2/2] Improved filtering for movies and shows --- site-observable/docs/data/films.sqlite.py | 15 +++++++++------ site-observable/docs/data/shows.sqlite.py | 10 +++++++++- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/site-observable/docs/data/films.sqlite.py b/site-observable/docs/data/films.sqlite.py index c770651..0b747e2 100755 --- a/site-observable/docs/data/films.sqlite.py +++ b/site-observable/docs/data/films.sqlite.py @@ -1,7 +1,7 @@ import os import sqlite3 import tempfile -from datetime import datetime, timedelta +from datetime import datetime import pandas as pd @@ -18,12 +18,15 @@ # Remove adult movies df = df[df["adult"] == False] # noqa: E712 - # Calculate the date for the past two years - years_ago = datetime.now() - timedelta(days=365 * 2) - start_date = years_ago.replace(month=1, day=1) + # Remove documentaries + df = df[df["genres"].str.contains("Documentary") == False] # noqa: E712 - # Filter the dataframe based on the start date - df = df[df["release_date"] >= start_date] + # Remove movies with a future release date + now = datetime.now() + df = df[df["release_date"] < now] + + # Remove movies with no known revenue + df = df[df["revenue"] > 0] # Add a column with the production_year based on the release_date df["production_year"] = df["release_date"].dt.year diff --git a/site-observable/docs/data/shows.sqlite.py b/site-observable/docs/data/shows.sqlite.py index e8211d7..2cfde7a 100755 --- a/site-observable/docs/data/shows.sqlite.py +++ b/site-observable/docs/data/shows.sqlite.py @@ -1,6 +1,7 @@ import os import sqlite3 import tempfile +from datetime import datetime import pandas as pd @@ -12,11 +13,18 @@ ) os.system("unzip full-tmdb-tv-shows-dataset-2023-150k-shows.zip >&2") - df = pd.read_csv("TMDB_tv_dataset_v3.csv") + df = pd.read_csv("TMDB_tv_dataset_v3.csv", parse_dates=["first_air_date"]) # Remove adult movies df = df[df["adult"] == False] # noqa: E712 + # Remove documentaries + df = df[df["genres"].str.contains("Documentary") == False] # noqa: E712 + + # Remove shows with a future first air date or no first air date + now = datetime.now() + df = df[df["first_air_date"] < now] + # Select the columns we want df = df[["id", "name", "original_name", "poster_path"]]