diff --git a/.gitignore b/.gitignore index 49a0bee..345c1ce 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ .virtualenv .vscode +.ruff_cache/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 54b8ddb..1a29b80 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: - id: black # Ruff - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.1.15 + rev: v0.2.1 hooks: - id: ruff args: [--fix] diff --git a/backend/api_app/controllers/apps.py b/backend/api_app/controllers/apps.py index 381151c..17b98a5 100644 --- a/backend/api_app/controllers/apps.py +++ b/backend/api_app/controllers/apps.py @@ -1,5 +1,11 @@ +"""API for app data. + +/apps/{store_id} a specific app +""" + import datetime import urllib.parse +from typing import Self import numpy as np import pandas as pd @@ -13,9 +19,11 @@ Category, Collection, DeveloperApps, + PackageDetails, ) -from config import get_logger +from config import AD_NETWORK_PACKAGE_IDS, TRACKER_PACKAGE_IDS, get_logger from dbcon.queries import ( + get_app_package_details, get_single_app, query_app_history, query_ranks_for_app, @@ -26,10 +34,6 @@ logger = get_logger(__name__) -""" -/apps/{store_id} a specific app -""" - def get_search_results(search_term: str) -> AppGroup: """Parse search term and return resulting APpGroup.""" @@ -42,17 +46,20 @@ def get_search_results(search_term: str) -> AppGroup: def get_app_history(app_dict: dict) -> dict: + """Get the history of app scraping.""" store_app = app_dict["id"] app_name = app_dict["name"] app_hist = query_app_history(store_app) - app_dict["histogram"] = app_hist.sort_values(["id"]).tail(1)["histogram"].values[0] + app_dict["histogram"] = ( + app_hist.sort_values(["id"]).tail(1)["histogram"].to_numpy()[0] + ) app_dict["history_table"] = app_hist.drop(["id", "store_app"], axis=1).to_dict( orient="records", ) app_hist["group"] = app_name app_hist = app_hist[ - ~((app_hist["installs"].isnull()) & (app_hist["rating_count"].isnull())) + ~((app_hist["installs"].isna()) & (app_hist["rating_count"].isna())) ] metrics = ["installs", "rating", "review_count", "rating_count"] group_col = "group" @@ -60,7 +67,7 @@ def get_app_history(app_dict: dict) -> dict: app_hist = app_hist.sort_values(xaxis_col) app_hist["date_change"] = app_hist[xaxis_col] - app_hist[xaxis_col].shift(1) app_hist["days_changed"] = app_hist["date_change"].apply( - lambda x: np.nan if pd.isnull(x) else x.days, + lambda x: np.nan if pd.isna(x) else x.days, ) change_metrics = [] for metric in metrics: @@ -95,7 +102,6 @@ def get_app_history(app_dict: dict) -> dict: }, ) ) - # TODO: KEEP? app_hist = app_hist.replace([np.inf, -np.inf], np.nan) app_hist = app_hist.dropna(axis="columns", how="all") if app_hist.empty: @@ -110,7 +116,6 @@ def get_app_history(app_dict: dict) -> dict: change_dicts = [] for metric in final_metrics: meltdf = mymelt.loc[mymelt.group == metric] - # meltdf = meltdf.rename(columns={"value": "percentage_value"}) melteddicts = meltdf.to_dict(orient="records") if "Rate of Change" in metric: change_dicts += melteddicts @@ -121,7 +126,8 @@ def get_app_history(app_dict: dict) -> dict: def get_string_date_from_days_ago(days: int) -> str: - mydate = datetime.datetime.utcnow() - datetime.timedelta(days=days) + """Get the stringified date from x days ago.""" + mydate = datetime.datetime.now(datetime.UTC) - datetime.timedelta(days=days) mydate_str = mydate.strftime("%Y-%m-%d") return mydate_str @@ -157,10 +163,13 @@ def get_app_overview_dict(collection: str) -> Collection: class AppController(Controller): + + """Controller holding all API endpoints for an app.""" + path = "/api/apps" @get(path="/collections/{collection:str}", cache=3600) - async def get_apps_overview(self, collection: str) -> Collection: + async def get_apps_overview(self: Self, collection: str) -> Collection: """Handle GET request for a list of apps. Args: @@ -173,14 +182,13 @@ async def get_apps_overview(self, collection: str) -> Collection: """ logger.info(f"{self.path} start {collection=}") - # print(f"collection={collection}") home_dict = get_app_overview_dict(collection=collection) logger.info(f"{self.path} return") return home_dict @get(path="/{store_id:str}", cache=3600) - async def get_app_detail(self, store_id: str) -> AppDetail: + async def get_app_detail(self: Self, store_id: str) -> AppDetail: """Handle GET request for a specific app. store_id (str): The id of the app to retrieve. @@ -193,8 +201,9 @@ async def get_app_detail(self, store_id: str) -> AppDetail: logger.info(f"{self.path} start") app_df = get_single_app(store_id) if app_df.empty: + msg = f"Store ID not found: {store_id!r}" raise NotFoundException( - f"Store ID not found: {store_id!r}", + msg, status_code=404, ) app_dict = app_df.to_dict(orient="records")[0] @@ -202,9 +211,76 @@ async def get_app_detail(self, store_id: str) -> AppDetail: app_dict["historyData"] = app_hist_dict return app_dict + @get(path="/{store_id:str}/packageinfo", cache=3600) + async def get_package_info(self: Self, store_id: str) -> PackageDetails: + """Handle GET request for a specific app. + + store_id (str): The id of the app to retrieve. + + Returns + ------- + json + + """ + logger.info(f"{self.path} start") + + df = get_app_package_details(store_id) + + if df.empty: + msg = f"Store ID not found: {store_id!r}" + raise NotFoundException( + msg, + status_code=404, + ) + + is_permission = df["xml_path"] == "uses-permission" + is_matching_packages = df["android_name"].str.startswith( + ".".join(store_id.split(".")[:2]), + ) + + is_android_activity = df["android_name"].str.contains( + r"^(com.android)|(android)", + ) + trackers = ")|(".join(TRACKER_PACKAGE_IDS) + trackers = f"^({trackers})" + is_tracker = df["android_name"].str.contains( + trackers, + ) + + ads = ")|(".join(AD_NETWORK_PACKAGE_IDS) + ads = f"({ads})" + is_ads = df["android_name"].str.contains( + ads, + ) + + permissions_df = df[is_permission] + android_services_df = df[is_android_activity] + tracker_df = df[is_tracker] + ads_df = df[is_ads] + + left_overs_df = df[ + ~is_permission + & ~is_matching_packages + & ~is_android_activity + & ~is_tracker + & ~is_ads + ] + permissions_list = permissions_df.android_name.tolist() + permissions_list = [ + x.replace("android.permission.", "") for x in permissions_list + ] + trackers_dict = PackageDetails( + trackers=tracker_df.android_name.tolist(), + permissions=permissions_list, + ads=ads_df.android_name.tolist(), + android=android_services_df.android_name.tolist(), + leftovers=left_overs_df.android_name.tolist(), + ) + return trackers_dict + @get(path="/{store_id:str}/ranks", cache=3600) - async def app_ranks(self, store_id: str) -> AppRank: - """Handles a GET request for a specific app ranks. + async def app_ranks(self: Self, store_id: str) -> AppRank: + """Handle GET requests for a specific app ranks. Args: ---- @@ -218,8 +294,9 @@ async def app_ranks(self, store_id: str) -> AppRank: logger.info(f"{self.path} start") df = query_ranks_for_app(store_id=store_id) if df.empty: + msg = f"Ranks not found for {store_id!r}" raise NotFoundException( - f"Ranks not found for {store_id!r}", + msg, status_code=404, ) df["rank_group"] = df["collection"] + ": " + df["category"] @@ -238,8 +315,8 @@ async def app_ranks(self, store_id: str) -> AppRank: return rank_dict @get(path="/developers/{developer_id:str}", cache=3600) - async def get_developer_apps(self, developer_id: str) -> DeveloperApps: - """Handles a GET request for a specific developer. + async def get_developer_apps(self: Self, developer_id: str) -> DeveloperApps: + """Handle GET request for a specific developer. Args: ---- @@ -254,8 +331,9 @@ async def get_developer_apps(self, developer_id: str) -> DeveloperApps: apps_df = query_single_developer(developer_id) if apps_df.empty: + msg = f"Store ID not found: {developer_id!r}" raise NotFoundException( - f"Store ID not found: {developer_id!r}", + msg, status_code=404, ) developer_name = apps_df.to_dict(orient="records")[0]["developer_name"] @@ -269,7 +347,14 @@ async def get_developer_apps(self, developer_id: str) -> DeveloperApps: return developer_apps @get(path="/search/{search_term:str}", cache=3600) - async def search(self, search_term: str) -> AppGroup: + async def search(self: Self, search_term: str) -> AppGroup: + """Search apps and developers. + + Args: + ---- + search_term: str the search term to search for. Can search packages, developers and app names. + + """ logger.info(f"{self.path} term={search_term}") apps_dict = get_search_results(search_term) diff --git a/backend/api_app/models.py b/backend/api_app/models.py index 4b62da5..4a4fdee 100644 --- a/backend/api_app/models.py +++ b/backend/api_app/models.py @@ -3,12 +3,25 @@ @dataclass class AppDetail: + """TODO: Fill out all details.""" store_id: str name: str +@dataclass +class PackageDetails: + + """Lists of android_name from Manifest.""" + + permissions: list[str] + trackers: list[str] + ads: list[str] + leftovers: list[str] + android: list[str] + + @dataclass class AppGroup: title: str # iOS or Google diff --git a/backend/config/__init__.py b/backend/config/__init__.py index 645ff0c..d681958 100644 --- a/backend/config/__init__.py +++ b/backend/config/__init__.py @@ -94,3 +94,11 @@ def get_logger(mod_name: str, log_name: str = "dash"): CONFIG = tomllib.load(f) DATE_FORMAT = "%Y-%m-%d" + +AD_NETWORK_PACKAGE_IDS = [ + "com.ironsource", + "com.unity3d.services", + "com.unity3d.ads", + "com.applovin", +] +TRACKER_PACKAGE_IDS = ["com.appsflyer", "com.facebook", "com.adjust", "com.kochava"] diff --git a/backend/dbcon/connections.py b/backend/dbcon/connections.py index b07dc26..76810bf 100644 --- a/backend/dbcon/connections.py +++ b/backend/dbcon/connections.py @@ -1,5 +1,4 @@ from sqlalchemy import create_engine -from sshtunnel import SSHTunnelForwarder from config import CONFIG, get_logger @@ -7,6 +6,8 @@ def open_ssh_tunnel(server_name: str): + from sshtunnel import SSHTunnelForwarder + with SSHTunnelForwarder( (CONFIG[server_name]["host"], 22), # Remote server IP and SSH port ssh_username=CONFIG[server_name]["os_user"], @@ -46,6 +47,7 @@ def get_postgres_server_ips(server_name: str) -> tuple[str, str]: class PostgresCon: + """Class for managing the connection to postgres Parameters diff --git a/backend/dbcon/queries.py b/backend/dbcon/queries.py index 8bd491b..ea24a21 100644 --- a/backend/dbcon/queries.py +++ b/backend/dbcon/queries.py @@ -308,7 +308,7 @@ def get_appstore_categories() -> pd.DataFrame: return df -def query_ranks_for_app(store_id: str, days=30) -> pd.DataFrame: +def query_ranks_for_app(store_id: str, days: int = 30) -> pd.DataFrame: start_date = ( datetime.datetime.now(datetime.UTC) - datetime.timedelta(days=days) ).strftime("%Y-%m-%d") @@ -459,6 +459,7 @@ def get_category_top_apps_by_installs(category: str, limit: int = 10) -> pd.Data def get_single_app(store_id: str) -> pd.DataFrame: + """Get basic app details for a single store_id.""" logger.info(f"Query for single app_id={store_id}") where_str = f"WHERE store_id = '{store_id}'" where_stmt: TextClause = text(where_str) @@ -483,7 +484,51 @@ def get_single_app(store_id: str) -> pd.DataFrame: return df +def get_app_package_details(store_id: str) -> pd.DataFrame: + """Get basic app details for a single store_id.""" + logger.info(f"Query for single app_id={store_id}") + where_str = f"store_id = '{store_id}'" + app_where_stmt: TextClause = text(where_str) + sel_query = f"""WITH latest_version_codes AS ( + SELECT + vc.store_app, + MAX(vc.version_code) AS max_version_code + FROM + version_codes AS vc + GROUP BY + vc.store_app + ) + SELECT + vc.store_app, + sa.store_id, + vd.* + FROM + version_details AS vd + LEFT JOIN + version_codes AS vc ON + vd.version_code = vc.id + INNER JOIN + latest_version_codes AS lvc ON + vc.store_app = lvc.store_app + AND vc.version_code = lvc.max_version_code + LEFT JOIN store_apps sa ON + sa.id = vc.store_app + WHERE + vd.android_name != '' + AND + {app_where_stmt} + ORDER BY + store_app, + xml_path, + android_name + ; + """ + df = pd.read_sql(sel_query, DBCON.engine) + return df + + def clean_app_df(df: pd.DataFrame) -> pd.DataFrame: + """Apply generic cleaning for a DF with app data from store_apps table.""" df["store"] = df["store"].replace({1: "Google Play", 2: "Apple App Store"}) string_nums = ["installs", "review_count", "rating_count"] for col in string_nums: @@ -616,6 +661,47 @@ def search_apps(search_input: str, limit: int = 100) -> pd.DataFrame: return df +def get_manifest_names() -> pd.DataFrame: + """Get manifest data. + + Data is pulled for some apks and extracted from the AndroidManifest.xml + """ + sel_query = """WITH latest_version_codes AS ( + SELECT + vc.store_app, + MAX(vc.version_code) AS max_version_code + FROM + version_codes AS vc + GROUP BY + vc.store_app + ) + SELECT + vc.store_app, + sa.store_id, + vd.* + FROM + version_details AS vd + LEFT JOIN + version_codes AS vc ON + vd.version_code = vc.id + INNER JOIN + latest_version_codes AS lvc ON + vc.store_app = lvc.store_app + AND vc.version_code = lvc.max_version_code + LEFT JOIN store_apps sa ON + sa.id = vc.store_app + WHERE + vd.android_name != '' + ORDER BY + store_app, + xml_path, + android_name + ; + """ + df = pd.read_sql(sel_query, DBCON.engine) + return df + + logger.info("set db engine") DBCON = get_db_connection("madrone") DBCON.set_engine() diff --git a/backend/pyproject.toml b/backend/pyproject.toml index f8c9399..246a0c4 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -5,9 +5,9 @@ description = "Backend server for app store dashboard" version = "0.0.1" [project.optional-dependencies] -dev = ["pre-commit", "psycopg2-binary"] +dev = ["pre-commit", "psycopg2-binary", "sshtunnel"] -dependencies = ["uvicorn", "litestar[standard]"] +dependencies = ["litestar[standard]", "pandas", "numpy", "sqlalchemy"] [build-system] requires = ["setuptools", "wheel"] @@ -15,22 +15,24 @@ requires = ["setuptools", "wheel"] [tool.ruff] #select = ["E", "W", "F", "B", "I", "N", "UP"] -select = ["ALL"] +lint.select = ["ALL"] target-version = 'py312' -ignore = [ +lint.ignore = [ "E501", # line length "W291", # trailing space at end of line "G", # Ignore pyflake logging as it does not like f-strings in log messages. "RET504", # Unnecessary assignment to variable before return, but I prefer for readability. "PD901", # avoid 'df' for dataframe, but I prefer this as the main df in each function + "D211", # Suppress ruff internal conflict + "D213", # Suppress ruff internal conflict ] # Avoid trying to fix flake8-bugbear (`B`) violations. -unfixable = ["B"] +lint.unfixable = ["B"] -fixable = ["ALL"] +lint.fixable = ["ALL"] exclude = ["tests/*"] diff --git a/frontend/.prettierrc b/frontend/.prettierrc index a77fdde..de7ec1b 100644 --- a/frontend/.prettierrc +++ b/frontend/.prettierrc @@ -3,7 +3,6 @@ "singleQuote": true, "trailingComma": "none", "printWidth": 100, - "plugins": ["prettier-plugin-svelte"], - "pluginSearchDirs": ["."], + "plugins": ["./node_modules/prettier-plugin-svelte/plugin.js"], "overrides": [{ "files": "*.svelte", "options": { "parser": "svelte" } }] } diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 074de54..3a0351b 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -36,7 +36,7 @@ "eslint": "^8.52.0", "eslint-config-prettier": "^9.1.0", "eslint-plugin-svelte": "^2.4.0", - "prettier": "^3.2.4", + "prettier": "^3.2.5", "prettier-plugin-svelte": "^3.1.2", "svelte-check": "^3.5.2" } @@ -3885,9 +3885,9 @@ } }, "node_modules/prettier": { - "version": "3.2.4", - "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.2.4.tgz", - "integrity": "sha512-FWu1oLHKCrtpO1ypU6J0SbK2d9Ckwysq6bHj/uaCP26DxrPpppCLQRGVuqAxSTvhF00AcvDRyYrLNW7ocBhFFQ==", + "version": "3.2.5", + "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.2.5.tgz", + "integrity": "sha512-3/GWa9aOC0YeD7LUfvOG2NiDyhOWRvt1k+rcKhOuYnMY24iiCphgneUfJDyFXd6rZCAnuLBv6UeAULtrhT/F4A==", "dev": true, "bin": { "prettier": "bin/prettier.cjs" diff --git a/frontend/package.json b/frontend/package.json index e64a190..0f96600 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -18,7 +18,7 @@ "eslint": "^8.52.0", "eslint-config-prettier": "^9.1.0", "eslint-plugin-svelte": "^2.4.0", - "prettier": "^3.2.4", + "prettier": "^3.2.5", "prettier-plugin-svelte": "^3.1.2", "svelte-check": "^3.5.2" }, diff --git a/frontend/src/routes/apps/[id]/+page.server.ts b/frontend/src/routes/apps/[id]/+page.server.ts index 415d132..9f6188f 100644 --- a/frontend/src/routes/apps/[id]/+page.server.ts +++ b/frontend/src/routes/apps/[id]/+page.server.ts @@ -14,6 +14,9 @@ export const load: PageServerLoad = async ({ params, setHeaders, locals }) => { const res = fetch(`http://localhost:8000/api/apps/${id}`); const ranks = fetch(`http://localhost:8000/api/apps/${id}/ranks`); + + const packageInfo = fetch(`http://localhost:8000/api/apps/${id}/packageinfo`); + try { return { myapp: { @@ -57,6 +60,27 @@ export const load: PageServerLoad = async ({ params, setHeaders, locals }) => { return 'Uncaught Error'; } ) + }, + myPackageInfo: { + streamed: packageInfo + .then((resp) => { + if (resp.status === 200) { + return resp.json(); + } else if (resp.status === 404) { + console.log('App Ranks Not found'); + return 'App Not Found'; + } else if (resp.status === 500) { + console.log('Ranks API Server error'); + return 'Backend Error'; + } + }) + .then( + (json) => json, + (error) => { + console.log('Uncaught error', error); + return 'Uncaught Error'; + } + ) } }; } catch (error) { diff --git a/frontend/src/routes/apps/[id]/+page.svelte b/frontend/src/routes/apps/[id]/+page.svelte index 7522507..b98c1a8 100644 --- a/frontend/src/routes/apps/[id]/+page.svelte +++ b/frontend/src/routes/apps/[id]/+page.svelte @@ -225,12 +225,46 @@

First Crawled: {appdata.created_at}

+ + {#await data.myPackageInfo.streamed} + Loading ... + {:then packageInfo} + {#if typeof packageInfo == 'string'} +

Permissions info not yet available for this app.

+ {:else} + {#if packageInfo.permissions && packageInfo.permissions.length > 0} +

Permissions

+
+ {#each packageInfo.permissions as permission} +

{permission}

+ {/each} +
+ {/if} + {#if packageInfo.trackers && packageInfo.trackers.length > 0} +

Trackers

+
+ {#each packageInfo.trackers as tracker} +

{tracker}

+ {/each} +
+ {/if} + {#if packageInfo.ads && packageInfo.ads.length > 0} +

Ad Networks

+
+ {#each packageInfo.ads as ad} +

{ad}

+ {/each} +
+ {/if} + {/if} + {/await} {/if} {:catch}

The server caught an error. Please try again or try another app.

{/await} +

Back to Home