diff --git a/analyses/2019_03_aliamcami_issue_22/2019_03_issue_22_data_prep.ipynb b/analyses/2019_03_aliamcami_issue_22/2019_03_issue_22_data_prep.ipynb new file mode 100644 index 0000000..b0bd7fd --- /dev/null +++ b/analyses/2019_03_aliamcami_issue_22/2019_03_issue_22_data_prep.ipynb @@ -0,0 +1,976 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", + " data = yaml.load(f.read()) or {}\n", + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", + " defaults = yaml.load(f)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 2
  • \n", + "
  • Cores: 2
  • \n", + "
  • Memory: 4.14 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import dask.dataframe as dd\n", + "from dask.distributed import Client\n", + "\n", + "#Initializing client\n", + "client = Client()\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['value_1000', 'value_len'], dtype='object')\n" + ] + } + ], + "source": [ + "#read sample data\n", + "df = dd.read_parquet('../../databases/clean_unindexed.parquet', engine='pyarrow', columns=['value_1000', 'value_len'])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1313.8734164674954, 0, 4496969, 25831.732898438877)\n" + ] + } + ], + "source": [ + "#printing some data so could have an overview of what len to expect\n", + "dfmean = df['value_len'].mean()\n", + "dfmin = df['value_len'].min()\n", + "dfmax = df['value_len'].max()\n", + "dfstd = df['value_len'].std()\n", + "mc = dd.compute(dfmean, dfmin, dfmax, dfstd);\n", + "print(mc)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "import tldextract\n", + "def extract_domain(url):\n", + " try:\n", + " extracted = tldextract.extract(url)\n", + " return '{}.{}'.format(extracted.domain, extracted.suffix)\n", + " except Exception as e:\n", + " return 'ERROR'" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "#Get very simplified value domain to have a very simple overview of the domain unique count\n", + "#Doing like this does not mean all the value field is equal, just the begining, \n", + "#but for the propouse of this investigation this is good enough\n", + "#and processing just the \"domain\" is pretty much all I can do with the 
processing power I have at my disposal\n", + "df['value_domain'] = df['value_1000'].apply(extract_domain)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Save to parquet the processed values with the simplified domain column\n", + "#so next time I can read from file instead of waiting very long time to process this column \n", + "\n", + "df.to_parquet('df_value1000_domain.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_1000value_lenvalue_domain
0Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...68Mozilla.
1Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...68Mozilla.
2Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...68Mozilla.
3Netscape8Netscape.
45.0 (X11)95.0 (X11).
\n", + "
" + ], + "text/plain": [ + " value_1000 value_len value_domain\n", + "0 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... 68 Mozilla.\n", + "1 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... 68 Mozilla.\n", + "2 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... 68 Mozilla.\n", + "3 Netscape 8 Netscape.\n", + "4 5.0 (X11) 9 5.0 (X11)." + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Read last saved data (I can start analysing from here instead of doing all previous steps)\n", + "hdf = dd.read_parquet('df_value1000_domain.parquet', engine='pyarrow')\n", + "hdf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Get all rows that have the value_len above 2000 characters,\n", + "#since I want to identify what the really large values are\n", + "#I really dont have to worry about the small values and I can eliminate them here\n", + "\n", + "#Also, I'm not sure what \"really large values\" refers to, I'm just guesing that for sure its not something this small\n", + "f = hdf[hdf['value_len'] > 2000]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(value_1000 113790686\n", + " value_len 113790686\n", + " value_domain 113790686\n", + " dtype: int64, value_1000 7214922\n", + " value_len 7214922\n", + " value_domain 7214922\n", + " dtype: int64)" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#getting to know how many rows I'm dealing with now\n", + "len(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "834540. 144483\n", + "{\"ScribeTransport\". 119637\n", + "{\"insdrSV\". 116564\n", + "{\"criteo_pt_cdb_metrics_expires\". 91008\n", + "3rlQ9p29SeCbCmQ934fYVA$0. 89735\n", + "H8wotBY9TFu5Q4Y3_iSQng$0. 
77243\n", + "Rn8kD3YeRdCYuK-Ya7fNyw$0. 77085\n", + "{\"ins-today-sId\". 73139\n", + "Na9BL8mAQgqyMAy1zxOlJg$0. 69490\n", + "Ytag3XR0TQWbMYl0wfPz1Q$0. 66683\n", + "Name: value_domain, dtype: int64" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#getting to know how many unique domains I'm dealing with\n", + "c = f['value_domain'].value_counts().compute()\n", + "c.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: The signature of `Series.to_csv` was aligned to that of `DataFrame.to_csv`, and argument 'header' will change its default value from False to True: please pass an explicit value to suppress this warning.\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + } + ], + "source": [ + "#Again saving data on disk so I can access it when I want with no need to \n", + "#reprocess things since it takes too much time with the processing power I have\n", + "\n", + "# c.to_csv('value_domain_count.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#grouping by domain \n", + "group = f.groupby('value_domain')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#Evaluating the mean, std, min, max and count for the value_len for each unique domain\n", + "result = group.agg({'value_len': ['mean', 'std', 'min', 'max', 'count']})\n", + "computed_result = result.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_len
meanstdminmaxcount
value_domain
🏳️‍🌈\\\",\\\"protected\\\".2020.57142917.386366201420607
!xticcsep*!!xticcsep*!!xticcsep*!0\",\"lscache-.41685.0000000.00000041685416852
!xticcsep*!!xticcsep*!!xticcsep*!{\\\"version\\\".41994.0000001995.605283387504463910
\"BODY.2384.0000000.000000238423846
%22%2C%22ct%22%3A%22Mountain%20View%22%2C%22co%22%3A%22United%20States%22%2C%22isp%22%3A%22Google%20Cloud%22%2C%22cn%22%3A%22North%20America%22%2C%22asn%22%3A15169%2C%22pr%22%3A%22California%22%2C%22pc%22%3A%2294043%22%2C%22org%22%3A%22Google%20Cloud%22%7D; chip_session=1; tfm_uid=sUHElOJ5CFCLO2OstOizfKprEkEQuE25; _cp=1555c065-bd10-4862-a519-c1c7b1b8fdfc; _.2078.25373170.8019742039224667
%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D0%26rnd%3D790973%22%5D; ip2l={%22country%22.2103.0000000.000000210321032
%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D1%26rnd%3D128284%22%5D; ip2l={%22country%22.2555.0000000.000000255525556
%252520El%252520Otomobiller%252520%25253A%252520Otomobil%252520Arama%2526link%253DArama%2526region%253Dapplication%2526pageIDType%253D1%.2482.11428692.42026923472571105
%3A1366\",\"_dyfs\".24857.266667260.844088245402528715
%3Ahttp%3A.283397.0000000.0000002833972833972
\n", + "
" + ], + "text/plain": [ + " value_len \\\n", + " mean \n", + "value_domain \n", + " 🏳️‍🌈\\\",\\\"protected\\\". 2020.571429 \n", + "!xticcsep*!!xticcsep*!!xticcsep*!0\",\"lscache-. 41685.000000 \n", + "!xticcsep*!!xticcsep*!!xticcsep*!{\\\"version\\\". 41994.000000 \n", + "\"BODY. 2384.000000 \n", + "%22%2C%22ct%22%3A%22Mountain%20View%22%2C%22co%... 2078.253731 \n", + "%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D0%... 2103.000000 \n", + "%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D1%... 2555.000000 \n", + "%252520El%252520Otomobiller%252520%25253A%25252... 2482.114286 \n", + "%3A1366\",\"_dyfs\". 24857.266667 \n", + "%3Ahttp%3A. 283397.000000 \n", + "\n", + " \\\n", + " std min \n", + "value_domain \n", + " 🏳️‍🌈\\\",\\\"protected\\\". 17.386366 2014 \n", + "!xticcsep*!!xticcsep*!!xticcsep*!0\",\"lscache-. 0.000000 41685 \n", + "!xticcsep*!!xticcsep*!!xticcsep*!{\\\"version\\\". 1995.605283 38750 \n", + "\"BODY. 0.000000 2384 \n", + "%22%2C%22ct%22%3A%22Mountain%20View%22%2C%22co%... 70.801974 2039 \n", + "%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D0%... 0.000000 2103 \n", + "%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D1%... 0.000000 2555 \n", + "%252520El%252520Otomobiller%252520%25253A%25252... 92.420269 2347 \n", + "%3A1366\",\"_dyfs\". 260.844088 24540 \n", + "%3Ahttp%3A. 0.000000 283397 \n", + "\n", + " \n", + " max count \n", + "value_domain \n", + " 🏳️‍🌈\\\",\\\"protected\\\". 2060 7 \n", + "!xticcsep*!!xticcsep*!!xticcsep*!0\",\"lscache-. 41685 2 \n", + "!xticcsep*!!xticcsep*!!xticcsep*!{\\\"version\\\". 44639 10 \n", + "\"BODY. 2384 6 \n", + "%22%2C%22ct%22%3A%22Mountain%20View%22%2C%22co%... 2246 67 \n", + "%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D0%... 2103 2 \n", + "%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D1%... 2555 6 \n", + "%252520El%252520Otomobiller%252520%25253A%25252... 2571 105 \n", + "%3A1366\",\"_dyfs\". 25287 15 \n", + "%3Ahttp%3A. 
283397 2 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "computed_result.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/pyarrow/pandas_compat.py:114: FutureWarning: A future version of pandas will default to `skipna=True`. To silence this warning, pass `skipna=True|False` explicitly.\n", + " result = infer_dtype(pandas_collection)\n" + ] + } + ], + "source": [ + "#again saving result to eliminate the need of recomputing\n", + "computed_result['value_len'].to_parquet('computed_result.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#read computed results if needed\n", + "# computed_result = dd.read_parquet('computed_result.parquet', engine='pyarrow')" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "#I noticed that some domains have very little occurencies\n", + "#so I decided to analyse the ones that appear the most, \n", + "#or at least a number of times that its interesting to analyse\n", + "cr1 = computed_result[computed_result.value_len['count'] > 500]" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "#I was still left with too many rows,\n", + "#so I decided to filter little bit more by value_len \n", + "#(again, the objective here is to know what is in the really large values, \n", + "#so I dont think filtering the smaller ones is a problem, and it will make process so much faster)\n", + "crf = cr1[cr1.value_len['max'] > 5000]" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_len
meanstdminmaxcount
value_domain
0.6162.61572328838.12810621514529792048
1.6357.25910123466.7834122001165355934
12_20171214163401_203aad7a\".137528.59489333.9439761374481376111723
12_20171214163401_22fdafb1\".193743.34222815.3419081936791937479444
12_20171214163401_2ee09a0c\".104294.22344810516.05964210387843650517127
12_20171214163401_4ab79343\".193679.0000000.0000001936791936791236
12_20171214163401_4f925001\".137545.38985729.873971137448137693631
1438\".197677.328664163455.9484782884783332928
1439\".162237.547330149440.1175532344492106824
195_af_lpdid\".3136.1511811389.90167122866445635
\n", + "
" + ], + "text/plain": [ + " value_len \\\n", + " mean std min max \n", + "value_domain \n", + "0. 6162.615723 28838.128106 2151 452979 \n", + "1. 6357.259101 23466.783412 2001 165355 \n", + "12_20171214163401_203aad7a\". 137528.594893 33.943976 137448 137611 \n", + "12_20171214163401_22fdafb1\". 193743.342228 15.341908 193679 193747 \n", + "12_20171214163401_2ee09a0c\". 104294.223448 10516.059642 103878 436505 \n", + "12_20171214163401_4ab79343\". 193679.000000 0.000000 193679 193679 \n", + "12_20171214163401_4f925001\". 137545.389857 29.873971 137448 137693 \n", + "1438\". 197677.328664 163455.948478 2884 783332 \n", + "1439\". 162237.547330 149440.117553 2344 492106 \n", + "195_af_lpdid\". 3136.151181 1389.901671 2286 6445 \n", + "\n", + " \n", + " count \n", + "value_domain \n", + "0. 2048 \n", + "1. 934 \n", + "12_20171214163401_203aad7a\". 1723 \n", + "12_20171214163401_22fdafb1\". 9444 \n", + "12_20171214163401_2ee09a0c\". 17127 \n", + "12_20171214163401_4ab79343\". 1236 \n", + "12_20171214163401_4f925001\". 631 \n", + "1438\". 928 \n", + "1439\". 824 \n", + "195_af_lpdid\". 635 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "crf.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "220" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#By this point I'm left with 220 unique domains, with I can actually analyse now\n", + "len(crf)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/pyarrow/pandas_compat.py:698: FutureWarning: .labels was deprecated in version 0.24.0. 
Use .codes instead.\n", + " labels = getattr(columns, 'labels', None) or [\n", + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/pyarrow/pandas_compat.py:725: FutureWarning: the 'labels' keyword is deprecated, use 'codes' instead\n", + " return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)\n", + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/pyarrow/pandas_compat.py:742: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.\n", + " labels, = index.labels\n" + ] + } + ], + "source": [ + "#I'm still not very familiar with neither dask or pandas, but I find pandas easier to use\n", + "#since I got a smaller dataset to analyse I'll use pandas for couple analyses from here on out\n", + "import pandas as pn\n", + "cc = pn.read_parquet('computed_result.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
meanstdminmaxcount
value_domain
{\"ScribeTransport\".4128.591406.462001721193409
{\"ins-today-sId\".5037.6914446.5220028774860426
{\"criteo_pt_cdb_metrics_expires\".9529.6653326.72200369203247543
font-face{font-family.162363.28172503.75263464806745059
{\"CLOUDFLARE.514484.07634151.124356325332442660
{\"__qubitUACategorisation\".64927.71105887.48201836896640003
Na9BL8mAQgqyMAy1zxOlJg$0.2236.68178.842001331237945
935971.3726.06396.413248469533010
{\"insdrSV\".4026.3012823.05200219104132981
834540.2218.71216.202001286432117
\n", + "
" + ], + "text/plain": [ + " mean std min max count\n", + "value_domain \n", + "{\"ScribeTransport\". 4128.59 1406.46 2001 7211 93409\n", + "{\"ins-today-sId\". 5037.69 14446.52 2002 87748 60426\n", + "{\"criteo_pt_cdb_metrics_expires\". 9529.66 53326.72 2003 692032 47543\n", + "font-face{font-family. 162363.28 172503.75 2634 648067 45059\n", + "{\"CLOUDFLARE. 514484.07 634151.12 4356 3253324 42660\n", + "{\"__qubitUACategorisation\". 64927.71 105887.48 2018 368966 40003\n", + "Na9BL8mAQgqyMAy1zxOlJg$0. 2236.68 178.84 2001 3312 37945\n", + "935971. 3726.06 396.41 3248 4695 33010\n", + "{\"insdrSV\". 4026.30 12823.05 2002 191041 32981\n", + "834540. 2218.71 216.20 2001 2864 32117" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#The top 5 occurencies\n", + "pn.options.display.float_format = '{:.2f}'.format\n", + "formated = cc.sort_values('count', ascending=False).head(10)\n", + "formated" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/analyses/2019_03_aliamcami_issue_22/README.md b/analyses/2019_03_aliamcami_issue_22/README.md new file mode 100644 index 0000000..8812680 --- /dev/null +++ b/analyses/2019_03_aliamcami_issue_22/README.md @@ -0,0 +1,24 @@ +# Huge Values Analysis +## Goal +Identify what's in the really large values. Proposed in issue [#22](https://github.com/mozilla/overscripted/issues/22). + +## Overview +The dataset was grouped by the first keyword of value column (value_domain). +Some statistical data was taken from the value_len for each row of each group. 
The result was filtered in search of the largest values with most occurrences. +Initial analyses were performed on "cloudflare" group since it holds the biggest min and max value_len. This demonstrated that the biggest values are structured scraped data in JSON format. + +The top results, sorted by count, are listed below. + +## Compiled Results: Top 10 +| value_domain | mean | std | min | max | count | +|-----------------------------------|-----------|-----------|------|---------|-------| +| {"ScribeTransport". | 4128.59 | 1406.46 | 2001 | 7211 | 93409 | +| {"ins-today-sId". | 5037.69 | 14446.52 | 2002 | 87748 | 60426 | +| {"criteo_pt_cdb_metrics_expires". | 9529.66 | 53326.72 | 2003 | 692032 | 47543 | +| font-face{font-family. | 162363.28 | 172503.75 | 2634 | 648067 | 45059 | +| {"CLOUDFLARE. | 514484.07 | 634151.12 | 4356 | 3253324 | 42660 | +| {"__qubitUACategorisation". | 64927.71 | 105887.48 | 2018 | 368966 | 40003 | +| Na9BL8mAQgqyMAy1zxOlJg$0. | 2236.68 | 178.84 | 2001 | 3312 | 37945 | +| 935971. | 3726.06 | 396.41 | 3248 | 4695 | 33010 | +| {"insdrSV". | 4026.30 | 12823.05 | 2002 | 191041 | 32981 | +| 834540. | 2218.71 | 216.20 | 2001 | 2864 | 32117 |