diff --git a/analyses/2019_03_aliamcami_issue_22/2019_03_issue_22_data_prep.ipynb b/analyses/2019_03_aliamcami_issue_22/2019_03_issue_22_data_prep.ipynb new file mode 100644 index 0000000..b0bd7fd --- /dev/null +++ b/analyses/2019_03_aliamcami_issue_22/2019_03_issue_22_data_prep.ipynb @@ -0,0 +1,976 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", + " data = yaml.load(f.read()) or {}\n", + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", + " defaults = yaml.load(f)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 2
  • \n", + "
  • Cores: 2
  • \n", + "
  • Memory: 4.14 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import dask.dataframe as dd\n", + "from dask.distributed import Client\n", + "\n", + "#Initializing client\n", + "client = Client()\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['value_1000', 'value_len'], dtype='object')\n" + ] + } + ], + "source": [ + "#read sample data\n", + "df = dd.read_parquet('../../databases/clean_unindexed.parquet', engine='pyarrow', columns=['value_1000', 'value_len'])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1313.8734164674954, 0, 4496969, 25831.732898438877)\n" + ] + } + ], + "source": [ + "#printing some data so could have an overview of what len to expect\n", + "dfmean = df['value_len'].mean()\n", + "dfmin = df['value_len'].min()\n", + "dfmax = df['value_len'].max()\n", + "dfstd = df['value_len'].std()\n", + "mc = dd.compute(dfmean, dfmin, dfmax, dfstd);\n", + "print(mc)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "import tldextract\n", + "def extract_domain(url):\n", + " try:\n", + " extracted = tldextract.extract(url)\n", + " return '{}.{}'.format(extracted.domain, extracted.suffix)\n", + " except Exception as e:\n", + " return 'ERROR'" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "#Get very simplified value domain to have a very simple overview of the domain unique count\n", + "#Doing like this does not mean all the value field is equal, just the begining, \n", + "#but for the propouse of this investigation this is good enough\n", + "#and processing just the \"domain\" is pretty much all I can do with the 
processing power I have at my disposal\n", + "df['value_domain'] = df['value_1000'].apply(extract_domain)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Save to parquet the processed values with the simplified domain column\n", + "#so next time I can read from file instead of waiting very long time to process this column \n", + "\n", + "df.to_parquet('df_value1000_domain.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_1000value_lenvalue_domain
0Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...68Mozilla.
1Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...68Mozilla.
2Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...68Mozilla.
3Netscape8Netscape.
45.0 (X11)95.0 (X11).
\n", + "
" + ], + "text/plain": [ + " value_1000 value_len value_domain\n", + "0 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... 68 Mozilla.\n", + "1 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... 68 Mozilla.\n", + "2 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... 68 Mozilla.\n", + "3 Netscape 8 Netscape.\n", + "4 5.0 (X11) 9 5.0 (X11)." + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Read last saved data (I can start analysing from here instead of doing all previous steps)\n", + "hdf = dd.read_parquet('df_value1000_domain.parquet', engine='pyarrow')\n", + "hdf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Get all rows that have the value_len above 2000 characters,\n", + "#since I want to identify what the really large values are\n", + "#I really dont have to worry about the small values and I can eliminate them here\n", + "\n", + "#Also, I'm not sure what \"really large values\" refers to, I'm just guesing that for sure its not something this small\n", + "f = hdf[hdf['value_len'] > 2000]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(value_1000 113790686\n", + " value_len 113790686\n", + " value_domain 113790686\n", + " dtype: int64, value_1000 7214922\n", + " value_len 7214922\n", + " value_domain 7214922\n", + " dtype: int64)" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#getting to know how many rows I'm dealing with now\n", + "len(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "834540. 144483\n", + "{\"ScribeTransport\". 119637\n", + "{\"insdrSV\". 116564\n", + "{\"criteo_pt_cdb_metrics_expires\". 91008\n", + "3rlQ9p29SeCbCmQ934fYVA$0. 89735\n", + "H8wotBY9TFu5Q4Y3_iSQng$0. 
77243\n", + "Rn8kD3YeRdCYuK-Ya7fNyw$0. 77085\n", + "{\"ins-today-sId\". 73139\n", + "Na9BL8mAQgqyMAy1zxOlJg$0. 69490\n", + "Ytag3XR0TQWbMYl0wfPz1Q$0. 66683\n", + "Name: value_domain, dtype: int64" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#getting to know how many unique domains I'm dealing with\n", + "c = f['value_domain'].value_counts().compute()\n", + "c.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: The signature of `Series.to_csv` was aligned to that of `DataFrame.to_csv`, and argument 'header' will change its default value from False to True: please pass an explicit value to suppress this warning.\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + } + ], + "source": [ + "#Again saving data on disk so I can access it when I want with no need to \n", + "#reprocess things since it takes too much time with the processing power I have\n", + "\n", + "# c.to_csv('value_domain_count.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#grouping by domain \n", + "group = f.groupby('value_domain')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#Evaluating the mean, std, min, max and count for the value_len for each unique domain\n", + "result = group.agg({'value_len': ['mean', 'std', 'min', 'max', 'count']})\n", + "computed_result = result.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_len
meanstdminmaxcount
value_domain
🏳️‍🌈\\\",\\\"protected\\\".2020.57142917.386366201420607
!xticcsep*!!xticcsep*!!xticcsep*!0\",\"lscache-.41685.0000000.00000041685416852
!xticcsep*!!xticcsep*!!xticcsep*!{\\\"version\\\".41994.0000001995.605283387504463910
\"BODY.2384.0000000.000000238423846
%22%2C%22ct%22%3A%22Mountain%20View%22%2C%22co%22%3A%22United%20States%22%2C%22isp%22%3A%22Google%20Cloud%22%2C%22cn%22%3A%22North%20America%22%2C%22asn%22%3A15169%2C%22pr%22%3A%22California%22%2C%22pc%22%3A%2294043%22%2C%22org%22%3A%22Google%20Cloud%22%7D; chip_session=1; tfm_uid=sUHElOJ5CFCLO2OstOizfKprEkEQuE25; _cp=1555c065-bd10-4862-a519-c1c7b1b8fdfc; _.2078.25373170.8019742039224667
%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D0%26rnd%3D790973%22%5D; ip2l={%22country%22.2103.0000000.000000210321032
%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D1%26rnd%3D128284%22%5D; ip2l={%22country%22.2555.0000000.000000255525556
%252520El%252520Otomobiller%252520%25253A%252520Otomobil%252520Arama%2526link%253DArama%2526region%253Dapplication%2526pageIDType%253D1%.2482.11428692.42026923472571105
%3A1366\",\"_dyfs\".24857.266667260.844088245402528715
%3Ahttp%3A.283397.0000000.0000002833972833972
\n", + "
" + ], + "text/plain": [ + " value_len \\\n", + " mean \n", + "value_domain \n", + " 🏳️‍🌈\\\",\\\"protected\\\". 2020.571429 \n", + "!xticcsep*!!xticcsep*!!xticcsep*!0\",\"lscache-. 41685.000000 \n", + "!xticcsep*!!xticcsep*!!xticcsep*!{\\\"version\\\". 41994.000000 \n", + "\"BODY. 2384.000000 \n", + "%22%2C%22ct%22%3A%22Mountain%20View%22%2C%22co%... 2078.253731 \n", + "%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D0%... 2103.000000 \n", + "%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D1%... 2555.000000 \n", + "%252520El%252520Otomobiller%252520%25253A%25252... 2482.114286 \n", + "%3A1366\",\"_dyfs\". 24857.266667 \n", + "%3Ahttp%3A. 283397.000000 \n", + "\n", + " \\\n", + " std min \n", + "value_domain \n", + " 🏳️‍🌈\\\",\\\"protected\\\". 17.386366 2014 \n", + "!xticcsep*!!xticcsep*!!xticcsep*!0\",\"lscache-. 0.000000 41685 \n", + "!xticcsep*!!xticcsep*!!xticcsep*!{\\\"version\\\". 1995.605283 38750 \n", + "\"BODY. 0.000000 2384 \n", + "%22%2C%22ct%22%3A%22Mountain%20View%22%2C%22co%... 70.801974 2039 \n", + "%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D0%... 0.000000 2103 \n", + "%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D1%... 0.000000 2555 \n", + "%252520El%252520Otomobiller%252520%25253A%25252... 92.420269 2347 \n", + "%3A1366\",\"_dyfs\". 260.844088 24540 \n", + "%3Ahttp%3A. 0.000000 283397 \n", + "\n", + " \n", + " max count \n", + "value_domain \n", + " 🏳️‍🌈\\\",\\\"protected\\\". 2060 7 \n", + "!xticcsep*!!xticcsep*!!xticcsep*!0\",\"lscache-. 41685 2 \n", + "!xticcsep*!!xticcsep*!!xticcsep*!{\\\"version\\\". 44639 10 \n", + "\"BODY. 2384 6 \n", + "%22%2C%22ct%22%3A%22Mountain%20View%22%2C%22co%... 2246 67 \n", + "%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D0%... 2103 2 \n", + "%2520ADFS%253F%2520%257C%2520Okta%26_biz_n%3D1%... 2555 6 \n", + "%252520El%252520Otomobiller%252520%25253A%25252... 2571 105 \n", + "%3A1366\",\"_dyfs\". 25287 15 \n", + "%3Ahttp%3A. 
283397 2 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "computed_result.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/pyarrow/pandas_compat.py:114: FutureWarning: A future version of pandas will default to `skipna=True`. To silence this warning, pass `skipna=True|False` explicitly.\n", + " result = infer_dtype(pandas_collection)\n" + ] + } + ], + "source": [ + "#again saving result to eliminate the need of recomputing\n", + "computed_result['value_len'].to_parquet('computed_result.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#read computed results if needed\n", + "# computed_result = dd.read_parquet('computed_result.parquet', engine='pyarrow')" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "#I noticed that some domains have very little occurencies\n", + "#so I decided to analyse the ones that appear the most, \n", + "#or at least a number of times that its interesting to analyse\n", + "cr1 = computed_result[computed_result.value_len['count'] > 500]" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "#I was still left with too many rows,\n", + "#so I decided to filter little bit more by value_len \n", + "#(again, the objective here is to know what is in the really large values, \n", + "#so I dont think filtering the smaller ones is a problem, and it will make process so much faster)\n", + "crf = cr1[cr1.value_len['max'] > 5000]" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_len
meanstdminmaxcount
value_domain
0.6162.61572328838.12810621514529792048
1.6357.25910123466.7834122001165355934
12_20171214163401_203aad7a\".137528.59489333.9439761374481376111723
12_20171214163401_22fdafb1\".193743.34222815.3419081936791937479444
12_20171214163401_2ee09a0c\".104294.22344810516.05964210387843650517127
12_20171214163401_4ab79343\".193679.0000000.0000001936791936791236
12_20171214163401_4f925001\".137545.38985729.873971137448137693631
1438\".197677.328664163455.9484782884783332928
1439\".162237.547330149440.1175532344492106824
195_af_lpdid\".3136.1511811389.90167122866445635
\n", + "
" + ], + "text/plain": [ + " value_len \\\n", + " mean std min max \n", + "value_domain \n", + "0. 6162.615723 28838.128106 2151 452979 \n", + "1. 6357.259101 23466.783412 2001 165355 \n", + "12_20171214163401_203aad7a\". 137528.594893 33.943976 137448 137611 \n", + "12_20171214163401_22fdafb1\". 193743.342228 15.341908 193679 193747 \n", + "12_20171214163401_2ee09a0c\". 104294.223448 10516.059642 103878 436505 \n", + "12_20171214163401_4ab79343\". 193679.000000 0.000000 193679 193679 \n", + "12_20171214163401_4f925001\". 137545.389857 29.873971 137448 137693 \n", + "1438\". 197677.328664 163455.948478 2884 783332 \n", + "1439\". 162237.547330 149440.117553 2344 492106 \n", + "195_af_lpdid\". 3136.151181 1389.901671 2286 6445 \n", + "\n", + " \n", + " count \n", + "value_domain \n", + "0. 2048 \n", + "1. 934 \n", + "12_20171214163401_203aad7a\". 1723 \n", + "12_20171214163401_22fdafb1\". 9444 \n", + "12_20171214163401_2ee09a0c\". 17127 \n", + "12_20171214163401_4ab79343\". 1236 \n", + "12_20171214163401_4f925001\". 631 \n", + "1438\". 928 \n", + "1439\". 824 \n", + "195_af_lpdid\". 635 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "crf.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "220" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#By this point I'm left with 220 unique domains, with I can actually analyse now\n", + "len(crf)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/pyarrow/pandas_compat.py:698: FutureWarning: .labels was deprecated in version 0.24.0. 
Use .codes instead.\n", + " labels = getattr(columns, 'labels', None) or [\n", + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/pyarrow/pandas_compat.py:725: FutureWarning: the 'labels' keyword is deprecated, use 'codes' instead\n", + " return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)\n", + "/home/user/anaconda3/envs/overscripted/lib/python3.6/site-packages/pyarrow/pandas_compat.py:742: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.\n", + " labels, = index.labels\n" + ] + } + ], + "source": [ + "#I'm still not very familiar with neither dask or pandas, but I find pandas easier to use\n", + "#since I got a smaller dataset to analyse I'll use pandas for couple analyses from here on out\n", + "import pandas as pn\n", + "cc = pn.read_parquet('computed_result.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
meanstdminmaxcount
value_domain
{\"ScribeTransport\".4128.591406.462001721193409
{\"ins-today-sId\".5037.6914446.5220028774860426
{\"criteo_pt_cdb_metrics_expires\".9529.6653326.72200369203247543
font-face{font-family.162363.28172503.75263464806745059
{\"CLOUDFLARE.514484.07634151.124356325332442660
{\"__qubitUACategorisation\".64927.71105887.48201836896640003
Na9BL8mAQgqyMAy1zxOlJg$0.2236.68178.842001331237945
935971.3726.06396.413248469533010
{\"insdrSV\".4026.3012823.05200219104132981
834540.2218.71216.202001286432117
\n", + "
" + ], + "text/plain": [ + " mean std min max count\n", + "value_domain \n", + "{\"ScribeTransport\". 4128.59 1406.46 2001 7211 93409\n", + "{\"ins-today-sId\". 5037.69 14446.52 2002 87748 60426\n", + "{\"criteo_pt_cdb_metrics_expires\". 9529.66 53326.72 2003 692032 47543\n", + "font-face{font-family. 162363.28 172503.75 2634 648067 45059\n", + "{\"CLOUDFLARE. 514484.07 634151.12 4356 3253324 42660\n", + "{\"__qubitUACategorisation\". 64927.71 105887.48 2018 368966 40003\n", + "Na9BL8mAQgqyMAy1zxOlJg$0. 2236.68 178.84 2001 3312 37945\n", + "935971. 3726.06 396.41 3248 4695 33010\n", + "{\"insdrSV\". 4026.30 12823.05 2002 191041 32981\n", + "834540. 2218.71 216.20 2001 2864 32117" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#The top 5 occurencies\n", + "pn.options.display.float_format = '{:.2f}'.format\n", + "formated = cc.sort_values('count', ascending=False).head(10)\n", + "formated" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/analyses/2019_03_aliamcami_issue_22/README.md b/analyses/2019_03_aliamcami_issue_22/README.md new file mode 100644 index 0000000..8812680 --- /dev/null +++ b/analyses/2019_03_aliamcami_issue_22/README.md @@ -0,0 +1,24 @@ +# Huge Values Analysis +## Goal +Identify what's in the really large values. Proposed in issue [#22](https://github.com/mozilla/overscripted/issues/22). + +## Overview +The dataset was grouped by the first keyword of value column (value_domain). +Some statistical data was taken from the value_len for each row of each group. 
The result was filtered in search of the largest values with most occurrences. +Initial analyses were performed on "cloudflare" group since it holds the biggest min and max value_len. This demonstrated that the biggest values are structured scraped data in JSON format. + +The top results, sorted by count, are listed below. + +## Compiled Results: Top 10 +| value_domain | mean | std | min | max | count | +|-----------------------------------|-----------|-----------|------|---------|-------| +| {"ScribeTransport". | 4128.59 | 1406.46 | 2001 | 7211 | 93409 | +| {"ins-today-sId". | 5037.69 | 14446.52 | 2002 | 87748 | 60426 | +| {"criteo_pt_cdb_metrics_expires". | 9529.66 | 53326.72 | 2003 | 692032 | 47543 | +| font-face{font-family. | 162363.28 | 172503.75 | 2634 | 648067 | 45059 | +| {"CLOUDFLARE. | 514484.07 | 634151.12 | 4356 | 3253324 | 42660 | +| {"__qubitUACategorisation". | 64927.71 | 105887.48 | 2018 | 368966 | 40003 | +| Na9BL8mAQgqyMAy1zxOlJg$0. | 2236.68 | 178.84 | 2001 | 3312 | 37945 | +| 935971. | 3726.06 | 396.41 | 3248 | 4695 | 33010 | +| {"insdrSV". | 4026.30 | 12823.05 | 2002 | 191041 | 32981 | +| 834540. | 2218.71 | 216.20 | 2001 | 2864 | 32117 |