{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ef62b0fd-67b0-4ad3-bad0-bdaec434acaf",
   "metadata": {},
   "source": [
    "# Joining zonal data\n",
    "Zonal stats were computed separately for each gap code, so the per-code results need to be joined back into a single table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e792c164-38bc-4821-823e-31274db6abec",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "import ibis\n",
    "from ibis import _\n",
    "\n",
    "# Put the parent of the working directory on sys.path\n",
    "# (presumably where minio_utils lives -- confirm).\n",
    "base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
    "if base_dir not in sys.path:\n",
    "    sys.path.insert(0, base_dir)\n",
    "\n",
    "from minio_utils import connect_minio\n",
    "\n",
    "# Do not unpack into `_`: that would overwrite the ibis deferred object imported\n",
    "# above, which the later cells use to build column expressions.\n",
    "con, _unused = connect_minio()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c34a00b-ccea-4be8-add4-db242a6aba74",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "labels = ['gap1', 'gap2', 'gap3', 'gap4', 'nonconserved']\n",
    "\n",
    "# Columns taken from the zonal-stats tables. Identical for every label, so built once.\n",
    "# NOTE(review): 'mean_freshwater_richness' used to be spelled\n",
    "# 'mean_top_freshwater_richness' here, which does not match the column selected in\n",
    "# the final export; confirm the upstream stats files use this name.\n",
    "names = ['id',\n",
    "         'pct_top_amphibian_richness', 'mean_amphibian_richness',\n",
    "         'pct_top_reptile_richness', 'mean_reptile_richness',\n",
    "         'pct_top_bird_richness', 'mean_bird_richness',\n",
    "         'pct_top_mammal_richness', 'mean_mammal_richness',\n",
    "         'pct_top_freshwater_richness', 'mean_freshwater_richness',\n",
    "         'pct_wetlands', 'pct_fire', 'pct_farmland', 'pct_grazing',\n",
    "         'pct_disadvantaged_community', 'pct_low_income_community',\n",
    "         'mean_plant_richness', 'pct_top_plant_richness'\n",
    "        ]\n",
    "\n",
    "# Collapse to one row per sub_id by keeping the first value of each column.\n",
    "agg_dict = {name: _[name].first() for name in names}\n",
    "\n",
    "for label in labels:\n",
    "    stats_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/{label}/**.parquet'\n",
    "    stats = (con.read_parquet(stats_url, union_by_name = True)\n",
    "             .drop('geom')\n",
    "             .group_by('sub_id')\n",
    "             .aggregate(**agg_dict)\n",
    "            )\n",
    "\n",
    "    url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/subsets/split_habitat_climate/{label}_habitat_climate.parquet'\n",
    "    base = con.read_parquet(url)\n",
    "\n",
    "    # Attach the aggregated stats to the base table for this label.\n",
    "    joined = base.inner_join(stats, ['sub_id', 'id'])\n",
    "    save_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/{label}_habitat_climate_stats.parquet'\n",
    "    joined.to_parquet(save_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6c2451f-aa6a-4d4e-b233-8b8b590fbfe0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns (and their order) written to the final combined table.\n",
    "cols = ['id',\n",
    "        'sub_id',\n",
    "        'name',\n",
    "        'manager',\n",
    "        'manager_type',\n",
    "        'gap_code',\n",
    "        'status',\n",
    "        'land_tenure',\n",
    "        'access_type',\n",
    "        'county',\n",
    "        'ecoregion',\n",
    "        'habitat_type',\n",
    "        'climate_zone',\n",
    "        'mean_amphibian_richness',\n",
    "        'mean_reptile_richness',\n",
    "        'mean_bird_richness',\n",
    "        'mean_mammal_richness',\n",
    "        'mean_plant_richness',\n",
    "        'mean_freshwater_richness',\n",
    "        'pct_top_amphibian_richness',\n",
    "        'pct_top_reptile_richness',\n",
    "        'pct_top_bird_richness',\n",
    "        'pct_top_mammal_richness',\n",
    "        'pct_top_plant_richness',\n",
    "        'pct_top_freshwater_richness',\n",
    "        'pct_wetlands',\n",
    "        'pct_fire',\n",
    "        'pct_farmland',\n",
    "        'pct_grazing_lands',\n",
    "        'pct_disadvantaged_community',\n",
    "        'pct_low_income_community',\n",
    "        'acres',\n",
    "        'geom']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c986c99-a403-4266-b031-3a882d93c1fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "stats_joined_url = 's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/*_habitat_climate_stats.parquet'\n",
    "\n",
    "# Columns whose nulls are replaced with the string 'None'.\n",
    "fill_none_cols = ['name', 'manager', 'manager_type', 'gap_code', 'status',\n",
    "                  'land_tenure', 'access_type', 'county', 'ecoregion',\n",
    "                  'habitat_type', 'climate_zone']\n",
    "\n",
    "joined_stats = (con\n",
    "    .read_parquet(stats_joined_url)\n",
    "    # Reproject geometry from EPSG:3310 to EPSG:4326.\n",
    "    .mutate(geom = _.geom.convert('epsg:3310', 'epsg:4326'))\n",
    "    .rename(pct_grazing_lands = 'pct_grazing')\n",
    "    # Kept as its own step so the null fill below sees the substituted gap_code.\n",
    "    .mutate(gap_code = _.gap_code.substitute({'Non-Conservation Area': 'None'}))\n",
    "    .mutate(**{col: _[col].fill_null('None') for col in fill_none_cols})\n",
    ")\n",
    "\n",
    "url1 = 's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/ca30x30_habitat_climate_stats.parquet'\n",
    "url2 = 's3://public-ca30x30/ca30x30_cbn_v3.parquet'\n",
    "data = joined_stats.select(cols).order_by('gap_code', 'county', 'name', 'id', 'sub_id')\n",
    "\n",
    "# The same table is written to both destinations.\n",
    "data.to_parquet(url1)\n",
    "data.to_parquet(url2)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}