Skip to content

Commit

Permalink
check if movies can be identified by original_title and release_year
Browse files Browse the repository at this point in the history
  • Loading branch information
machbry committed Mar 6, 2024
1 parent 9046263 commit c5b73e2
Showing 1 changed file with 42 additions and 13 deletions.
55 changes: 42 additions & 13 deletions notebooks/create_tmdb_subsets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@
"\n",
"EXPORT_TMDB_SUBSETS_TO = Path(\"../data/tmdb_subsets\").resolve()\n",
"\n",
"MOVIES_COLUMNS_OF_INTEREST = ['title', 'original_title', 'release_date', 'production_countries', 'genres', 'production_companies', 'vote_average', 'revenue']\n",
"MOVIES_COLUMNS_OF_INTEREST = ['title', 'original_title', 'release_date', 'production_countries', 'genres', 'production_companies']\n",
"NB_MOVIES_SUBSET = 5000\n",
"\n",
"TVSHOWS_COLUMNS_OF_INTEREST = ['name', 'original_name', 'first_air_date', 'production_countries', 'genres', 'production_companies', 'vote_average']\n",
"TVSHOWS_COLUMNS_OF_INTEREST = ['name', 'original_name', 'first_air_date', 'production_countries', 'genres', 'production_companies']\n",
"NB_TVSHOWS_SUBSET = 5000"
]
},
Expand Down Expand Up @@ -81,7 +81,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 13,
"id": "bf19fe99",
"metadata": {},
"outputs": [],
Expand All @@ -92,19 +92,48 @@
"df_tvshows = pd.read_csv(EXTRACT_TWSHOWS_ZIP_TO / os.listdir(EXTRACT_TWSHOWS_ZIP_TO)[0])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "ad152395",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6497 movies are note uniquely identify by original_title & release_year on 797541 movies (0.81%)\n"
]
}
],
"source": [
"# Sanity check: can (original_title, release_year) serve as a natural key for movies?\n",
"# Only released, non-adult movies with a non-null release_date are considered.\n",
"df = df_movies[(df_movies['status'] == 'Released') &\n",
"               (~df_movies['adult']) &\n",
"               (~df_movies['release_date'].isna())].copy()\n",
"\n",
"# release_date is handled as a string here; its first four characters are taken as the year.\n",
"df['release_year'] = df['release_date'].str[:4].astype(int)\n",
"\n",
"# Number of movies sharing each (original_title, release_year) pair.\n",
"df_by_title_year = df.groupby(by=['original_title', 'release_year']).id.count()\n",
"duplicated_pairs = df_by_title_year[df_by_title_year > 1]\n",
"\n",
"# nb_duplicates_title_year counts the ambiguous pairs, nb_ambiguous_movies counts the movies\n",
"# inside those pairs; only the latter is comparable to nb_total_movies.\n",
"nb_duplicates_title_year = duplicated_pairs.shape[0]\n",
"nb_ambiguous_movies = int(duplicated_pairs.sum())\n",
"nb_total_movies = df.shape[0]\n",
"\n",
"# Guard the ratio so an empty selection does not raise ZeroDivisionError.\n",
"pct_ambiguous_movies = 100 * nb_ambiguous_movies / nb_total_movies if nb_total_movies else 0.0\n",
"print(f\"{nb_ambiguous_movies} movies are not uniquely identified by original_title & release_year out of {nb_total_movies} movies ({pct_ambiguous_movies:.2f}%)\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "7429796d",
"metadata": {},
"outputs": [],
"source": [
"df_movies.dropna(axis=0, how='any', subset=MOVIES_COLUMNS_OF_INTEREST, inplace=True)\n",
"df_movies_subset = df_movies.dropna(axis=0, how='any', subset=MOVIES_COLUMNS_OF_INTEREST)\n",
"\n",
"df_movies_subset = df_movies[(df_movies['status'] == 'Released') & \n",
" (~df_movies['adult']) &\n",
" (df_movies['release_date'] < '2024-03-01') &\n",
" (df_movies['original_language'].isin(['fr', 'en']))].sort_values(by='release_date', ascending=False).iloc[0:NB_MOVIES_SUBSET]\n",
"df_movies_subset = df_movies_subset[(df_movies_subset['status'] == 'Released') & \n",
" (~df_movies_subset['adult']) &\n",
" (df_movies_subset['release_date'] < '2024-03-01') &\n",
" (df_movies_subset['original_language'].isin(['fr', 'en']))].sort_values(by='release_date', ascending=False).iloc[0:NB_MOVIES_SUBSET]\n",
" \n",
"df_movies_subset.to_csv(EXPORT_TMDB_SUBSETS_TO / \"tmdb_movies_subset.csv\")"
]
Expand All @@ -116,12 +145,12 @@
"metadata": {},
"outputs": [],
"source": [
"df_tvshows.dropna(axis=0, how='any', subset=TVSHOWS_COLUMNS_OF_INTEREST, inplace=True)\n",
"df_tvshows_subset = df_tvshows.dropna(axis=0, how='any', subset=TVSHOWS_COLUMNS_OF_INTEREST)\n",
"\n",
"df_tvshows_subset = df_tvshows[(df_tvshows['status'] == 'Ended') & \n",
" (~df_tvshows['adult']) &\n",
" (df_tvshows['last_air_date'] < '2024-03-01') &\n",
" (df_tvshows['original_language'].isin(['fr', 'en']))].sort_values(by='last_air_date', ascending=False).iloc[0:NB_TVSHOWS_SUBSET]\n",
"df_tvshows_subset = df_tvshows_subset[(df_tvshows_subset['status'] == 'Ended') & \n",
" (~df_tvshows_subset['adult']) &\n",
" (df_tvshows_subset['last_air_date'] < '2024-03-01') &\n",
" (df_tvshows_subset['original_language'].isin(['fr', 'en']))].sort_values(by='last_air_date', ascending=False).iloc[0:NB_TVSHOWS_SUBSET]\n",
" \n",
"df_tvshows_subset.to_csv(EXPORT_TMDB_SUBSETS_TO / \"tmdb_tvshows_subset.csv\")"
]
Expand Down

0 comments on commit c5b73e2

Please sign in to comment.