diff --git a/notebooks/create_tmdb_subsets.ipynb b/notebooks/create_tmdb_subsets.ipynb index d432166..e62e131 100644 --- a/notebooks/create_tmdb_subsets.ipynb +++ b/notebooks/create_tmdb_subsets.ipynb @@ -47,10 +47,10 @@ "\n", "EXPORT_TMDB_SUBSETS_TO = Path(\"../data/tmdb_subsets\").resolve()\n", "\n", - "MOVIES_COLUMNS_OF_INTEREST = ['title', 'original_title', 'release_date', 'production_countries', 'genres', 'production_companies', 'vote_average', 'revenue']\n", + "MOVIES_COLUMNS_OF_INTEREST = ['title', 'original_title', 'release_date', 'production_countries', 'genres', 'production_companies']\n", "NB_MOVIES_SUBSET = 5000\n", "\n", - "TVSHOWS_COLUMNS_OF_INTEREST = ['name', 'original_name', 'first_air_date', 'production_countries', 'genres', 'production_companies', 'vote_average']\n", + "TVSHOWS_COLUMNS_OF_INTEREST = ['name', 'original_name', 'first_air_date', 'production_countries', 'genres', 'production_companies']\n", "NB_TVSHOWS_SUBSET = 5000" ] }, @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 13, "id": "bf19fe99", "metadata": {}, "outputs": [], @@ -92,6 +92,35 @@ "df_tvshows = pd.read_csv(EXTRACT_TWSHOWS_ZIP_TO / os.listdir(EXTRACT_TWSHOWS_ZIP_TO)[0])" ] }, + { + "cell_type": "code", + "execution_count": 26, + "id": "ad152395", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6497 movies are note uniquely identify by original_title & release_year on 797541 movies (0.81%)\n" + ] + } + ], + "source": [ + "df = df_movies[(df_movies['status'] == 'Released') & \n", + " (~df_movies['adult']) &\n", + " (~df_movies['release_date'].isna())].copy()\n", + "\n", + "df['release_year'] = df['release_date'].apply(lambda date : date[0:4]).astype(int)\n", + "\n", + "df_by_title_year = df.groupby(by=['original_title', 'release_year']).id.count()\n", + "\n", + "nb_duplicates_title_year = df_by_title_year[df_by_title_year > 1].shape[0]\n", + "nb_total_movies = df.shape[0]\n", + "print(f\"{nb_duplicates_title_year} movies are note uniquely identify by original_title & release_year on {nb_total_movies} movies ({(100 * nb_duplicates_title_year / nb_total_movies):.2f}%)\")\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": 6, @@ -99,12 +128,12 @@ "metadata": {}, "outputs": [], "source": [ - "df_movies.dropna(axis=0, how='any', subset=MOVIES_COLUMNS_OF_INTEREST, inplace=True)\n", + "df_movies_subset = df_movies.dropna(axis=0, how='any', subset=MOVIES_COLUMNS_OF_INTEREST)\n", "\n", - "df_movies_subset = df_movies[(df_movies['status'] == 'Released') & \n", - " (~df_movies['adult']) &\n", - " (df_movies['release_date'] < '2024-03-01') &\n", - " (df_movies['original_language'].isin(['fr', 'en']))].sort_values(by='release_date', ascending=False).iloc[0:NB_MOVIES_SUBSET]\n", + "df_movies_subset = df_movies_subset[(df_movies_subset['status'] == 'Released') & \n", + " (~df_movies_subset['adult']) &\n", + " (df_movies_subset['release_date'] < '2024-03-01') &\n", + " (df_movies_subset['original_language'].isin(['fr', 'en']))].sort_values(by='release_date', ascending=False).iloc[0:NB_MOVIES_SUBSET]\n", " \n", "df_movies_subset.to_csv(EXPORT_TMDB_SUBSETS_TO / \"tmdb_movies_subset.csv\")" ] @@ -116,12 +145,12 @@ "metadata": {}, "outputs": [], "source": [ - "df_tvshows.dropna(axis=0, how='any', subset=TVSHOWS_COLUMNS_OF_INTEREST, inplace=True)\n", + "df_tvshows_subset = df_tvshows.dropna(axis=0, how='any', subset=TVSHOWS_COLUMNS_OF_INTEREST)\n", "\n", - "df_tvshows_subset = df_tvshows[(df_tvshows['status'] == 'Ended') & \n", - " (~df_tvshows['adult']) &\n", - " (df_tvshows['last_air_date'] < '2024-03-01') &\n", - " (df_tvshows['original_language'].isin(['fr', 'en']))].sort_values(by='last_air_date', ascending=False).iloc[0:NB_TVSHOWS_SUBSET]\n", + "df_tvshows_subset = df_tvshows_subset[(df_tvshows_subset['status'] == 'Ended') & \n", + " (~df_tvshows_subset['adult']) &\n", + " (df_tvshows_subset['last_air_date'] < '2024-03-01') &\n", + " (df_tvshows_subset['original_language'].isin(['fr', 'en']))].sort_values(by='last_air_date', ascending=False).iloc[0:NB_TVSHOWS_SUBSET]\n", " \n", "df_tvshows_subset.to_csv(EXPORT_TMDB_SUBSETS_TO / \"tmdb_tvshows_subset.csv\")" ]