Fix resource preview model #74

Draft · wants to merge 22 commits into base: master

Changes from all commits (22 commits)
2910dd4: [#71] adding support for CSV Aggregation (pkdash, Sep 13, 2024)
7cc0205: [#71] tests for csv aggregation (pkdash, Sep 13, 2024)
63de432: [#71] updating helper get data object function to pass kwargs (pkdash, Sep 18, 2024)
3e1c4b2: [#71] adding new example text files (pkdash, Sep 18, 2024)
d69f2f9: [#71] updating aggregation operations notebook to include csv metadat… (pkdash, Sep 18, 2024)
a22bbed: [#71] updating aggregation data object operations notebook to include… (pkdash, Sep 18, 2024)
61a51e4: [#71] allow creating fileset aggregation using folder path (pkdash, Sep 18, 2024)
6ff7128: [#71] updating example notebook to include code example for creating … (pkdash, Sep 18, 2024)
d8600f0: [#71] adding md files for api and metadata for csv aggregation docume… (pkdash, Sep 18, 2024)
6880f9d: [#71] updating mkdocs configuration file for csv aggregation (pkdash, Sep 18, 2024)
280a8cc: [#71] updating metadata md documentation files to use the respective … (pkdash, Sep 18, 2024)
cde9cc8: [#71] loading csv data to pandas dataframe with datatype set to string (pkdash, Sep 19, 2024)
11a14d7: [#71] using latest version of hsmodels (pkdash, Sep 25, 2024)
02d53e7: [#71] changes to user model website field (pkdash, Sep 27, 2024)
b557e56: [#71] adding folder related code examples to example notebook (pkdash, Sep 27, 2024)
342c7c0: [#71] csv content type column properties output formatting in notebook (pkdash, Sep 27, 2024)
03aba8e: [#71] referencing base aggregation class in all aggregation metadata … (pkdash, Oct 1, 2024)
58a4149: [#71] using autorefs plugin in mkdocs configuration (pkdash, Oct 1, 2024)
b39ac14: fixing field definitions for ResourcePreview model (pkdash, Oct 9, 2024)
ab788f6: fixing coverages field of ResourcePreview model (pkdash, Oct 9, 2024)
bf0e9fa: fixing model field name resource metadata url (pkdash, Oct 9, 2024)
b7076aa: fixing Token model with optional fields set to None for default values (pkdash, Oct 9, 2024)
1 change: 1 addition & 0 deletions docs/api/csv.md
@@ -0,0 +1 @@
::: hsclient.hydroshare.CSVAggregation
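The `:::` line above is a mkdocstrings identifier directive: it tells mkdocs to render the API documentation for `hsclient.hydroshare.CSVAggregation` in place. For context, a hypothetical `mkdocs.yml` fragment consistent with this PR's commit messages (the actual configuration file is not shown in this diff):

```yaml
# Hypothetical sketch only; see commits 6880f9d and 58a4149 for the real changes
plugins:
  - autorefs        # commit 58a4149: use the autorefs plugin
  - mkdocstrings    # renders "::: <identifier>" blocks such as docs/api/csv.md
nav:
  - API:
      - CSV: api/csv.md
```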
168 changes: 166 additions & 2 deletions docs/examples/Aggregation_Data_Object_Operations.ipynb
@@ -14,7 +14,8 @@
" * Time series\n",
" * Geographic feature\n",
" * Geographic raster\n",
" * Multidimensional NetCDF"
" * Multidimensional NetCDF\n",
" * CSV"
],
"metadata": {
"collapsed": false
@@ -84,8 +85,9 @@
"* Geographic feature : fiona.Collection\n",
"* Geographic raster : rasterio.DatasetReader\n",
"* Multidimensional NetCDF : xarray.Dataset\n",
"* CSV : pandas.DataFrame\n",
"\n",
"In the following code examples, we are assuming that we have a resource in HydroShare that contains the above four aggregation types. All these aggregations are at the root of the resource. The resource id used in the following code examples is \"a0e0c2e2e5e84e1e9b6b2b2b2b2b2b2b\". You will need to change this resource id to the id of your resource in HydroShare.\n"
"In the following code examples, we are assuming that we have a resource in HydroShare that contains the above five aggregation types. All these aggregations are at the root of the resource. The resource id used in the following code examples is \"a0e0c2e2e5e84e1e9b6b2b2b2b2b2b2b\". You will need to change this resource id to the id of your resource in HydroShare.\n"
],
"metadata": {
"collapsed": false
@@ -936,6 +938,168 @@
},
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"### Loading CSV Data to pandas.DataFrame\n",
"Here we assume the CSV aggregation contains a CSV file named \"sample.csv\"."
]
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# retrieve the CSV aggregation\n",
"file_path = \"sample.csv\"\n",
"csv_aggr = resource.aggregation(file__path=file_path)"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# show the aggregation type\n",
"print(f\"Aggregation Type:{csv_aggr.metadata.type}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# download the CSV aggregation - these directory paths must exist for hsclient to download and unzip the aggregation zip file\n",
"# Note: These directory paths need to be changed based on where you want to download the aggregation\n",
"download_to = os.path.join(base_working_dir, \"csv_testing\")\n",
"unzip_to = os.path.join(download_to, \"aggr_unzipped\")\n",
"aggr_path = resource.aggregation_download(aggregation=csv_aggr, save_path=download_to, unzip_to=unzip_to)\n",
"print(f\"Downloaded aggregation to:{aggr_path}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# load the CSV aggregation as pandas.DataFrame\n",
"csv_df = csv_aggr.as_data_object(agg_path=aggr_path)"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# show number of rows and columns\n",
"print(f\"Number of data rows:{len(csv_df)}\")\n",
"print(f\"Number of data columns:{len(csv_df.columns)}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# show the first 5 data rows\n",
"print(csv_df.head(5))"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# show the extracted CSV aggregation metadata (table schema)\n",
"table_schema = csv_aggr.metadata.tableSchema\n",
"table = table_schema.table\n",
"print(f\"Number of data rows:{table_schema.rows}\")\n",
"print(f\"Number of data columns:{len(table.columns)}\")\n",
"print(f\"Delimiter:{table_schema.delimiter}\")\n",
"\n",
"# show data column properties\n",
"for col in table.columns:\n",
" print(f\"Column number:{col.column_number}\")\n",
" print(f\"Column title:{col.title}\")\n",
" print(f\"Column description:{col.description}\")\n",
" print(f\"Column data type:{col.datatype}\")\n",
" print(\"-\"*50) "
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": "***Editing CSV aggregation using pandas.DataFrame***"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# drop the last data column - note all editing needs to be in 'inplace' mode\n",
"csv_df.drop(csv_df.columns[-1], axis=1, inplace=True)\n",
"# show the number of data columns after the edit\n",
"print(f\"Number of data columns after edit:{len(csv_df.columns)}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# save the updated CSV aggregation in HydroShare\n",
"# Note: this will overwrite the original aggregation - this operation may take a while\n",
"csv_aggr = csv_aggr.save_data_object(resource=resource, agg_path=aggr_path, as_new_aggr=False)\n",
"print(\"Aggregation updated ...\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# we can also create a new CSV aggregation in HydroShare using the updated pandas.DataFrame object\n",
"# we first create a new folder in which the new aggregation will be created\n",
"aggr_folder = \"csv_folder\"\n",
"resource.folder_create(folder=aggr_folder)\n",
"\n",
"# this operation may take a while\n",
"csv_aggr = csv_aggr.save_data_object(resource=resource, agg_path=aggr_path, as_new_aggr=True, destination_path=aggr_folder)\n",
"print(\"New CSV aggregation was created ...\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# retrieve the updated CSV aggregation to verify the data got updated\n",
"download_to = os.path.join(base_working_dir, \"csv_testing\")\n",
"\n",
"# note the unzip_to directory must exist and be empty\n",
"unzip_to = os.path.join(download_to, \"aggr_unzipped\")\n",
"aggr_path = resource.aggregation_download(aggregation=csv_aggr, save_path=download_to, unzip_to=unzip_to)\n",
"csv_df = csv_aggr.as_data_object(agg_path=aggr_path)\n",
"\n",
"# show the number of data rows and columns\n",
"print(f\"Number of data rows:{len(csv_df)}\")\n",
"print(f\"Number of data columns:{len(csv_df.columns)}\")\n",
"# show the first 5 data rows\n",
"print(csv_df.head(5))"
],
"outputs": [],
"execution_count": null
}
],
"metadata": {