Commit · 5a3c665
1 Parent(s): 6ab4321
fixed! now using different method for h3.
- preprocess/CBN-data.ipynb +47 -76
- preprocess/h3_utils.py +61 -159
- preprocess/preprocess.ipynb +13 -94
- preprocess/utils.py +1 -17
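The substance of the change is in preprocess/h3_utils.py: the old size-estimation and chunking pipeline (check_size, chunk_geom, chunk_large_geom, join_chunked) is replaced by a zoom-aware driver that either polyfills geometries at the requested zoom or, when a coarser zoom already exists on S3, derives finer cells from it with h3_cell_to_children. Below is a minimal standalone sketch of that idea (not the repository's exact code; the connection setup, table names and file paths are illustrative):

```python
import duckdb

# Sketch of the new strategy in h3_utils.py. Assumptions: DuckDB with the
# community h3 extension and the spatial extension; 'vectors.parquet' has a
# 'geom' column; output paths mirror the repo's hex/zoom{z}/ layout.
con = duckdb.connect()
con.sql("INSTALL h3 FROM community; LOAD h3; INSTALL spatial; LOAD spatial;")

target_zoom = 8
existing_zoom = None  # set to e.g. 5 if a coarser hex parquet was already written

if existing_zoom is None:
    # No hex data yet: polyfill each geometry directly at the target zoom.
    con.sql(f"""
        COPY (
            SELECT * EXCLUDE (geom),
                   UNNEST(h3_polygon_wkt_to_cells_string(ST_AsText(ST_Force2D(geom)), {target_zoom})) AS h{target_zoom}
            FROM read_parquet('vectors.parquet')
        ) TO 'hex_zoom{target_zoom}.parquet' (FORMAT PARQUET)
    """)
else:
    # A coarser zoom exists: expand parent cells one level at a time.
    con.sql(f"CREATE TEMP TABLE h3_h{existing_zoom} AS "
            f"FROM read_parquet('hex_zoom{existing_zoom}.parquet')")
    for z in range(existing_zoom + 1, target_zoom + 1):
        con.sql(f"""
            CREATE TEMP TABLE h3_h{z} AS
            SELECT * EXCLUDE (h{z-1}),
                   UNNEST(h3_cell_to_children(h{z-1}, {z})) AS h{z}
            FROM h3_h{z-1}
        """)
        con.sql(f"COPY h3_h{z} TO 'hex_zoom{z}.parquet' (FORMAT PARQUET)")
```

The notebook changes are mostly mechanical: callers now pass an explicit zoom (zoom = 8) to convert_h3, and redundant unzip/download/convert_pmtiles steps are commented out.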
preprocess/CBN-data.ipynb
CHANGED
|
@@ -48,11 +48,11 @@
|
|
| 48 |
"folder = 'Counties'\n",
|
| 49 |
"name = 'CA_counties'\n",
|
| 50 |
"\n",
|
| 51 |
-
"unzip(s3, folder = folder, file = '30x30_Counties.zip')\n",
|
| 52 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.shp\")\n",
|
| 53 |
-
"convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 54 |
"\n",
|
| 55 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)"
|
| 56 |
]
|
| 57 |
},
|
| 58 |
{
|
|
@@ -76,11 +76,10 @@
|
|
| 76 |
"\n",
|
| 77 |
"folder = 'Climate_zones'\n",
|
| 78 |
"name = 'climate_zones_10'\n",
|
| 79 |
-
"download(s3, folder = folder, file = 'clusters_10.tif')\n",
|
| 80 |
"cols = process_raster(s3, folder = folder, file = 'clusters_10.tif', file_name = f\"{name}.tif\")\n",
|
| 81 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols
|
| 82 |
-
"\n"
|
| 83 |
-
"\n"
|
| 84 |
]
|
| 85 |
},
|
| 86 |
{
|
|
@@ -105,10 +104,10 @@
|
|
| 105 |
"folder = 'Ecoregion'\n",
|
| 106 |
"name = 'ACE_ecoregions'\n",
|
| 107 |
"\n",
|
| 108 |
-
"unzip(s3, folder = folder, file = '30x30_Ecoregions.zip')\n",
|
| 109 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.shp\")\n",
|
| 110 |
"\n",
|
| 111 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)"
|
| 112 |
]
|
| 113 |
},
|
| 114 |
{
|
|
@@ -127,24 +126,11 @@
|
|
| 127 |
"#### 13 class major habitat types **"
|
| 128 |
]
|
| 129 |
},
|
| 130 |
-
{
|
| 131 |
-
"cell_type": "code",
|
| 132 |
-
"execution_count": null,
|
| 133 |
-
"id": "df40e121-e2d4-4962-9c30-ed7e931bb705",
|
| 134 |
-
"metadata": {},
|
| 135 |
-
"outputs": [],
|
| 136 |
-
"source": [
|
| 137 |
-
"# download(folder = 'Habitat', file = 'CWHR13_2022.tif')\n",
|
| 138 |
-
"# cols = process_raster(s3, folder = 'Habitat', file = 'CWHR13_2022.tif')"
|
| 139 |
-
]
|
| 140 |
-
},
|
| 141 |
{
|
| 142 |
"cell_type": "code",
|
| 143 |
"execution_count": null,
|
| 144 |
"id": "de501ac3-f6fe-44f5-86c1-afba763147ae",
|
| 145 |
-
"metadata": {
|
| 146 |
-
"scrolled": true
|
| 147 |
-
},
|
| 148 |
"outputs": [],
|
| 149 |
"source": [
|
| 150 |
"%%time\n",
|
|
@@ -153,7 +139,6 @@
|
|
| 153 |
"\n",
|
| 154 |
"folder = 'Habitat'\n",
|
| 155 |
"name = 'fveg22_1'\n",
|
| 156 |
-
"\n",
|
| 157 |
"# unzip(s3, folder = folder, file = 'fveg221gdb.zip')\n",
|
| 158 |
"\n",
|
| 159 |
"# command = [\n",
|
|
@@ -167,24 +152,8 @@
|
|
| 167 |
"cols = process_raster(s3, folder = folder, file = f\"{name}.tif\")\n",
|
| 168 |
"# upload(folder = folder, file = f'{name}_processed.tif.aux.xml')\n",
|
| 169 |
"\n",
|
| 170 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols
|
| 171 |
-
|
| 172 |
-
},
|
| 173 |
-
{
|
| 174 |
-
"cell_type": "code",
|
| 175 |
-
"execution_count": null,
|
| 176 |
-
"id": "ac178c43-f6a5-4286-a348-48bfcb1e9397",
|
| 177 |
-
"metadata": {},
|
| 178 |
-
"outputs": [],
|
| 179 |
-
"source": [
|
| 180 |
-
"# url = f\"s3://public-ca30x30/{folder}/{name}.parquet\"\n",
|
| 181 |
-
"\n",
|
| 182 |
-
"folder = 'Habitat'\n",
|
| 183 |
-
"name = 'fveg22_1'\n",
|
| 184 |
-
"url = f\"s3://public-ca30x30/CBN-data/{folder}/hex/{name}.parquet\"\n",
|
| 185 |
-
"\n",
|
| 186 |
-
"con.read_parquet(url).head(5).execute()\n",
|
| 187 |
-
"\n"
|
| 188 |
]
|
| 189 |
},
|
| 190 |
{
|
|
@@ -223,7 +192,7 @@
|
|
| 223 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.geojson\")\n",
|
| 224 |
"# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.geojson\")\n",
|
| 225 |
"\n",
|
| 226 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n",
|
| 227 |
"# gdf = gpd.read_parquet(f\"{name}.parquet\")\n"
|
| 228 |
]
|
| 229 |
},
|
|
@@ -332,7 +301,7 @@
|
|
| 332 |
"\n",
|
| 333 |
"# download(s3, folder = folder, file = f\"{name}.tif\")\n",
|
| 334 |
"cols = filter_raster(s3, folder = folder, file = f\"{name}.tif\", percentile = 80)\n",
|
| 335 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols)"
|
| 336 |
]
|
| 337 |
},
|
| 338 |
{
|
|
@@ -359,7 +328,7 @@
|
|
| 359 |
"\n",
|
| 360 |
"download(s3, folder = folder, file = f\"{name}.tif\")\n",
|
| 361 |
"cols = filter_raster(s3, folder = folder, file = f\"{name}.tif\", percentile = 80)\n",
|
| 362 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols)"
|
| 363 |
]
|
| 364 |
},
|
| 365 |
{
|
|
@@ -402,7 +371,8 @@
|
|
| 402 |
"name = 'rcn_wIntactBioCat_caOnly_2020-10-27'\n",
|
| 403 |
"\n",
|
| 404 |
"cols = process_raster(s3, folder = folder, file = f\"{name}.tif\")\n",
|
| 405 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols
|
|
|
|
| 406 |
]
|
| 407 |
},
|
| 408 |
{
|
|
@@ -491,7 +461,7 @@
|
|
| 491 |
"\n",
|
| 492 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
|
| 493 |
"convert_pmtiles(con, s3, folder =folder, file = f\"{name}.parquet\")\n",
|
| 494 |
-
"geom_to_h3(con, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
|
| 495 |
]
|
| 496 |
},
|
| 497 |
{
|
|
@@ -594,29 +564,29 @@
|
|
| 594 |
"set_secrets(con)\n",
|
| 595 |
"\n",
|
| 596 |
"folder = 'NBS_agriculture/Farmland'\n",
|
| 597 |
-
"unzip(s3, folder = folder, file = 'Important_Farmland_2018.zip')\n",
|
| 598 |
"\n",
|
| 599 |
"folder = 'NBS_agriculture/Farmland_all'\n",
|
| 600 |
"name = 'Important_Farmland_2018'\n",
|
| 601 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.gdb\",crs = \"epsg:4326\")\n",
|
| 602 |
-
"convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n",
|
| 603 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n",
|
| 604 |
"\n",
|
| 605 |
"# only pick a subset \n",
|
| 606 |
"folder = 'NBS_agriculture/Farmland_all/Farmland'\n",
|
| 607 |
"name = 'Farmland_2018'\n",
|
| 608 |
-
"gdf = gpd.read_file('Important_Farmland_2018.gdb')\n",
|
| 609 |
-
"farmland_type = ['P','S','L','U'] # prime, statewide importance, local importance, unique\n",
|
| 610 |
-
"gdf_farmland = gdf[gdf['polygon_ty'].isin(farmland_type)]\n",
|
| 611 |
-
"cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf_farmland)\n",
|
| 612 |
-
"convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n",
|
| 613 |
"\n",
|
| 614 |
"# grazing lands \n",
|
| 615 |
"folder = 'NBS_agriculture/Farmland_all/Lands_suitable_grazing'\n",
|
| 616 |
"name = 'Grazing_land_2018'\n",
|
| 617 |
-
"gdf_grazing = gdf[gdf['polygon_ty'] == 'G']\n",
|
| 618 |
-
"cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf_grazing)\n",
|
| 619 |
-
"convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n"
|
| 620 |
]
|
| 621 |
},
|
| 622 |
{
|
|
@@ -673,9 +643,9 @@
|
|
| 673 |
"# gdf['YEAR_'] = gdf['YEAR_'].astype('int64')\n",
|
| 674 |
"# gdf = gdf[gdf['YEAR_']>=2014]\n",
|
| 675 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
|
| 676 |
-
"convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 677 |
"\n",
|
| 678 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)"
|
| 679 |
]
|
| 680 |
},
|
| 681 |
{
|
|
@@ -759,10 +729,10 @@
|
|
| 759 |
"folder = 'Progress_data_new_protection/Newly_counted_lands'\n",
|
| 760 |
"name = 'newly_counted_lands_2024'\n",
|
| 761 |
"\n",
|
| 762 |
-
"unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
|
| 763 |
-
"cols = process_vector(s3, folder = folder, file = f\"{name}.shp\")\n",
|
| 764 |
-
"convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 765 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
|
| 766 |
]
|
| 767 |
},
|
| 768 |
{
|
|
@@ -790,7 +760,7 @@
|
|
| 790 |
"unzip(s3, folder = folder, file = 'sb535dacgdbf2022gdb.zip')\n",
|
| 791 |
"cols = process_vector(s3, folder = folder, file = 'SB535DACgdb_F_2022.gdb', file_name = f\"{name}.parquet\")\n",
|
| 792 |
"convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 793 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
|
| 794 |
]
|
| 795 |
},
|
| 796 |
{
|
|
@@ -814,17 +784,17 @@
|
|
| 814 |
"\n",
|
| 815 |
"folder = 'Progress_data_new_protection/Priority_populations'\n",
|
| 816 |
"name = 'CalEnviroScreen4'\n",
|
| 817 |
-
"unzip(s3, folder = folder, file = 'Priority Populations 4.0 Geodatabase.zip')\n",
|
| 818 |
"\n",
|
| 819 |
"gdf = (con.read_geo('Priority Populations 4.0 Combined Layer.gdb')\n",
|
| 820 |
" .mutate(id=ibis.row_number().over()) #making a unique id \n",
|
| 821 |
" ).execute().set_crs('EPSG:3857')\n",
|
| 822 |
"\n",
|
| 823 |
-
"cols = process_vector(folder = folder, file = 'Priority Populations 4.0 Combined Layer.gdb',\n",
|
| 824 |
" file_name = f\"{name}.parquet\", gdf = gdf)\n",
|
| 825 |
"\n",
|
| 826 |
-
"convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 827 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
|
| 828 |
]
|
| 829 |
},
|
| 830 |
{
|
|
@@ -903,13 +873,14 @@
|
|
| 903 |
"set_secrets(con)\n",
|
| 904 |
"\n",
|
| 905 |
"# file = 'ca-30x30-base.parquet'\n",
|
| 906 |
-
"folder = \"Preprocessing\"\n",
|
| 907 |
"name = 'ca-30x30-base'\n",
|
| 908 |
-
"
|
| 909 |
"\n",
|
| 910 |
"# gdf = gpd.read_parquet(f\"{name}.parquet\")\n",
|
| 911 |
-
"cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 912 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols
|
|
|
|
| 913 |
]
|
| 914 |
},
|
| 915 |
{
|
|
@@ -930,21 +901,21 @@
|
|
| 930 |
"con = ibis.duckdb.connect('cpad',extensions = [\"spatial\", \"h3\"])\n",
|
| 931 |
"set_secrets(con)\n",
|
| 932 |
"\n",
|
| 933 |
-
"folder = '
|
| 934 |
"name = 'cced_2024b_release'\n",
|
| 935 |
"\n",
|
| 936 |
"# unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
|
| 937 |
"# cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:3310\")\n",
|
| 938 |
"# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 939 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:4326\")\n",
|
| 940 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= cols)\n",
|
| 941 |
"\n",
|
| 942 |
"name = 'cpad_2024b_release'\n",
|
| 943 |
"# unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
|
| 944 |
"# cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:3310\")\n",
|
| 945 |
"# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 946 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:4326\")\n",
|
| 947 |
-
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= cols)"
|
| 948 |
]
|
| 949 |
}
|
| 950 |
],
|
|
|
|
| 48 |
"folder = 'Counties'\n",
|
| 49 |
"name = 'CA_counties'\n",
|
| 50 |
"\n",
|
| 51 |
+
"# unzip(s3, folder = folder, file = '30x30_Counties.zip')\n",
|
| 52 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.shp\")\n",
|
| 53 |
+
"# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 54 |
"\n",
|
| 55 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)"
|
| 56 |
]
|
| 57 |
},
|
| 58 |
{
|
|
|
|
| 76 |
"\n",
|
| 77 |
"folder = 'Climate_zones'\n",
|
| 78 |
"name = 'climate_zones_10'\n",
|
| 79 |
+
"# download(s3, folder = folder, file = 'clusters_10.tif')\n",
|
| 80 |
"cols = process_raster(s3, folder = folder, file = 'clusters_10.tif', file_name = f\"{name}.tif\")\n",
|
| 81 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols,\n",
|
| 82 |
+
" zoom = 8)\n"
|
|
|
|
| 83 |
]
|
| 84 |
},
|
| 85 |
{
|
|
|
|
| 104 |
"folder = 'Ecoregion'\n",
|
| 105 |
"name = 'ACE_ecoregions'\n",
|
| 106 |
"\n",
|
| 107 |
+
"# unzip(s3, folder = folder, file = '30x30_Ecoregions.zip')\n",
|
| 108 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.shp\")\n",
|
| 109 |
"\n",
|
| 110 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)"
|
| 111 |
]
|
| 112 |
},
|
| 113 |
{
|
|
|
|
| 126 |
"#### 13 class major habitat types **"
|
| 127 |
]
|
| 128 |
},
|
| 129 |
{
|
| 130 |
"cell_type": "code",
|
| 131 |
"execution_count": null,
|
| 132 |
"id": "de501ac3-f6fe-44f5-86c1-afba763147ae",
|
| 133 |
+
"metadata": {},
|
|
|
|
|
|
|
| 134 |
"outputs": [],
|
| 135 |
"source": [
|
| 136 |
"%%time\n",
|
|
|
|
| 139 |
"\n",
|
| 140 |
"folder = 'Habitat'\n",
|
| 141 |
"name = 'fveg22_1'\n",
|
|
|
|
| 142 |
"# unzip(s3, folder = folder, file = 'fveg221gdb.zip')\n",
|
| 143 |
"\n",
|
| 144 |
"# command = [\n",
|
|
|
|
| 152 |
"cols = process_raster(s3, folder = folder, file = f\"{name}.tif\")\n",
|
| 153 |
"# upload(folder = folder, file = f'{name}_processed.tif.aux.xml')\n",
|
| 154 |
"\n",
|
| 155 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols,\n",
|
| 156 |
+
" zoom = 8)"
|
| 157 |
]
|
| 158 |
},
|
| 159 |
{
|
|
|
|
| 192 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.geojson\")\n",
|
| 193 |
"# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.geojson\")\n",
|
| 194 |
"\n",
|
| 195 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)\n",
|
| 196 |
"# gdf = gpd.read_parquet(f\"{name}.parquet\")\n"
|
| 197 |
]
|
| 198 |
},
|
|
|
|
| 301 |
"\n",
|
| 302 |
"# download(s3, folder = folder, file = f\"{name}.tif\")\n",
|
| 303 |
"cols = filter_raster(s3, folder = folder, file = f\"{name}.tif\", percentile = 80)\n",
|
| 304 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols, zoom = 8)"
|
| 305 |
]
|
| 306 |
},
|
| 307 |
{
|
|
|
|
| 328 |
"\n",
|
| 329 |
"download(s3, folder = folder, file = f\"{name}.tif\")\n",
|
| 330 |
"cols = filter_raster(s3, folder = folder, file = f\"{name}.tif\", percentile = 80)\n",
|
| 331 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols, zoom = 8)"
|
| 332 |
]
|
| 333 |
},
|
| 334 |
{
|
|
|
|
| 371 |
"name = 'rcn_wIntactBioCat_caOnly_2020-10-27'\n",
|
| 372 |
"\n",
|
| 373 |
"cols = process_raster(s3, folder = folder, file = f\"{name}.tif\")\n",
|
| 374 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols, \n",
|
| 375 |
+
" zoom = 8)"
|
| 376 |
]
|
| 377 |
},
|
| 378 |
{
|
|
|
|
| 461 |
"\n",
|
| 462 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
|
| 463 |
"convert_pmtiles(con, s3, folder =folder, file = f\"{name}.parquet\")\n",
|
| 464 |
+
"geom_to_h3(con, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)\n"
|
| 465 |
]
|
| 466 |
},
|
| 467 |
{
|
|
|
|
| 564 |
"set_secrets(con)\n",
|
| 565 |
"\n",
|
| 566 |
"folder = 'NBS_agriculture/Farmland'\n",
|
| 567 |
+
"# unzip(s3, folder = folder, file = 'Important_Farmland_2018.zip')\n",
|
| 568 |
"\n",
|
| 569 |
"folder = 'NBS_agriculture/Farmland_all'\n",
|
| 570 |
"name = 'Important_Farmland_2018'\n",
|
| 571 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.gdb\",crs = \"epsg:4326\")\n",
|
| 572 |
+
"# convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n",
|
| 573 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)\n",
|
| 574 |
"\n",
|
| 575 |
"# only pick a subset \n",
|
| 576 |
"folder = 'NBS_agriculture/Farmland_all/Farmland'\n",
|
| 577 |
"name = 'Farmland_2018'\n",
|
| 578 |
+
"# gdf = gpd.read_file('Important_Farmland_2018.gdb')\n",
|
| 579 |
+
"# farmland_type = ['P','S','L','U'] # prime, statewide importance, local importance, unique\n",
|
| 580 |
+
"# gdf_farmland = gdf[gdf['polygon_ty'].isin(farmland_type)]\n",
|
| 581 |
+
"# cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf_farmland)\n",
|
| 582 |
+
"# convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n",
|
| 583 |
"\n",
|
| 584 |
"# grazing lands \n",
|
| 585 |
"folder = 'NBS_agriculture/Farmland_all/Lands_suitable_grazing'\n",
|
| 586 |
"name = 'Grazing_land_2018'\n",
|
| 587 |
+
"# gdf_grazing = gdf[gdf['polygon_ty'] == 'G']\n",
|
| 588 |
+
"# cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf_grazing)\n",
|
| 589 |
+
"# convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n"
|
| 590 |
]
|
| 591 |
},
|
| 592 |
{
|
|
|
|
| 643 |
"# gdf['YEAR_'] = gdf['YEAR_'].astype('int64')\n",
|
| 644 |
"# gdf = gdf[gdf['YEAR_']>=2014]\n",
|
| 645 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
|
| 646 |
+
"# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 647 |
"\n",
|
| 648 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)"
|
| 649 |
]
|
| 650 |
},
|
| 651 |
{
|
|
|
|
| 729 |
"folder = 'Progress_data_new_protection/Newly_counted_lands'\n",
|
| 730 |
"name = 'newly_counted_lands_2024'\n",
|
| 731 |
"\n",
|
| 732 |
+
"# unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
|
| 733 |
+
"cols = process_vector(s3, folder = folder, file = f\"{name}.shp\",crs = \"epsg:4326\")\n",
|
| 734 |
+
"# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 735 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)\n"
|
| 736 |
]
|
| 737 |
},
|
| 738 |
{
|
|
|
|
| 760 |
"unzip(s3, folder = folder, file = 'sb535dacgdbf2022gdb.zip')\n",
|
| 761 |
"cols = process_vector(s3, folder = folder, file = 'SB535DACgdb_F_2022.gdb', file_name = f\"{name}.parquet\")\n",
|
| 762 |
"convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 763 |
+
"# convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)\n"
|
| 764 |
]
|
| 765 |
},
|
| 766 |
{
|
|
|
|
| 784 |
"\n",
|
| 785 |
"folder = 'Progress_data_new_protection/Priority_populations'\n",
|
| 786 |
"name = 'CalEnviroScreen4'\n",
|
| 787 |
+
"# unzip(s3, folder = folder, file = 'Priority Populations 4.0 Geodatabase.zip')\n",
|
| 788 |
"\n",
|
| 789 |
"gdf = (con.read_geo('Priority Populations 4.0 Combined Layer.gdb')\n",
|
| 790 |
" .mutate(id=ibis.row_number().over()) #making a unique id \n",
|
| 791 |
" ).execute().set_crs('EPSG:3857')\n",
|
| 792 |
"\n",
|
| 793 |
+
"cols = process_vector(s3, folder = folder, file = 'Priority Populations 4.0 Combined Layer.gdb',\n",
|
| 794 |
" file_name = f\"{name}.parquet\", gdf = gdf)\n",
|
| 795 |
"\n",
|
| 796 |
+
"# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 797 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)\n"
|
| 798 |
]
|
| 799 |
},
|
| 800 |
{
|
|
|
|
| 873 |
"set_secrets(con)\n",
|
| 874 |
"\n",
|
| 875 |
"# file = 'ca-30x30-base.parquet'\n",
|
| 876 |
+
"folder = \"CA_Nature/2024/Preprocessing\"\n",
|
| 877 |
"name = 'ca-30x30-base'\n",
|
| 878 |
+
"download(s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 879 |
"\n",
|
| 880 |
"# gdf = gpd.read_parquet(f\"{name}.parquet\")\n",
|
| 881 |
+
"cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", crs=\"EPSG:4326\")\n",
|
| 882 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, \n",
|
| 883 |
+
" zoom = 8)\n"
|
| 884 |
]
|
| 885 |
},
|
| 886 |
{
|
|
|
|
| 901 |
"con = ibis.duckdb.connect('cpad',extensions = [\"spatial\", \"h3\"])\n",
|
| 902 |
"set_secrets(con)\n",
|
| 903 |
"\n",
|
| 904 |
+
"folder = 'CPAD'\n",
|
| 905 |
"name = 'cced_2024b_release'\n",
|
| 906 |
"\n",
|
| 907 |
"# unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
|
| 908 |
"# cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:3310\")\n",
|
| 909 |
"# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 910 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:4326\")\n",
|
| 911 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= cols, zoom = 8)\n",
|
| 912 |
"\n",
|
| 913 |
"name = 'cpad_2024b_release'\n",
|
| 914 |
"# unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
|
| 915 |
"# cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:3310\")\n",
|
| 916 |
"# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
|
| 917 |
"cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:4326\")\n",
|
| 918 |
+
"convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= cols, zoom = 8)"
|
| 919 |
]
|
| 920 |
}
|
| 921 |
],
|
preprocess/h3_utils.py
CHANGED
|
@@ -1,14 +1,63 @@
|
|
| 1 |
from utils import *
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
| 8 |
|
| 9 |
-
def compute_h3(con, name, cols, zoom):
|
| 10 |
"""
|
| 11 |
-
Computes hexes
|
| 12 |
"""
|
| 13 |
con.raw_sql(f'''
|
| 14 |
CREATE OR REPLACE TEMP TABLE t2 AS
|
|
@@ -17,161 +66,14 @@ def compute_h3(con, name, cols, zoom):
|
|
| 17 |
FROM {name}
|
| 18 |
)
|
| 19 |
SELECT {cols},
|
| 20 |
-
h3_polygon_wkt_to_cells_string(UNNEST(geom).geom, {zoom}) AS h{zoom}
|
| 21 |
FROM t1
|
| 22 |
''')
|
| 23 |
-
|
| 24 |
-
def check_size(con, name, zoom, sample_size):
|
| 25 |
-
"""
|
| 26 |
-
Estimating size of geoms to decide if we need to process in chunks
|
| 27 |
-
"""
|
| 28 |
-
query = f"""
|
| 29 |
-
SELECT
|
| 30 |
-
avg(len(h3_polygon_wkt_to_cells_string(ST_AsText(geom), {zoom}))::DOUBLE) AS avg_h3_len,
|
| 31 |
-
max(len(h3_polygon_wkt_to_cells_string(ST_AsText(geom), {zoom}))) AS max_h3_len,
|
| 32 |
-
count(*) AS total_rows
|
| 33 |
-
FROM {name}
|
| 34 |
-
USING SAMPLE {sample_size}
|
| 35 |
-
"""
|
| 36 |
-
stats = con.sql(query).execute()
|
| 37 |
-
avg_len = stats.iloc[0]['avg_h3_len']
|
| 38 |
-
max_len = stats.iloc[0]['max_h3_len']
|
| 39 |
-
total_rows = con.table(name).count().execute()
|
| 40 |
-
|
| 41 |
-
est_total_h3 = avg_len * total_rows
|
| 42 |
-
|
| 43 |
-
print(f"Estimated total H3 cells: {est_total_h3:,.0f}")
|
| 44 |
-
print(f"Max H3 cells in one geometry: {max_len:,}")
|
| 45 |
-
|
| 46 |
-
return est_total_h3, max_len
|
| 47 |
-
|
| 48 |
-
def chunk_large_geom(con, s3, bucket, path, name, zoom, big_n, batch_limit):
|
| 49 |
-
"""
|
| 50 |
-
Individually processing large geoms (different from processing "chunks")
|
| 51 |
-
"""
|
| 52 |
-
offset = 0
|
| 53 |
-
i = 0
|
| 54 |
-
while True:
|
| 55 |
-
relative_key = f"{path}/hex/zoom{zoom}/{name}_large_{i:03d}.parquet"
|
| 56 |
-
print(f"🟠 Checking large geometry batch {i} → {relative_key}")
|
| 57 |
-
|
| 58 |
-
if exists_on_s3(s3, folder="", file=relative_key): # we pass relative_key as `file`
|
| 59 |
-
print(f"⏩ Skipping existing large batch: {relative_key}")
|
| 60 |
-
offset += batch_limit
|
| 61 |
-
i += 1
|
| 62 |
-
continue
|
| 63 |
|
| 64 |
-
print(f"📝 Writing large geometry batch {i} → {relative_key}")
|
| 65 |
-
q = con.sql(f'''
|
| 66 |
-
SELECT *, UNNEST(h{zoom}) AS h{zoom}
|
| 67 |
-
FROM t2
|
| 68 |
-
WHERE len(h{zoom}) > {big_n}
|
| 69 |
-
LIMIT {batch_limit} OFFSET {offset}
|
| 70 |
-
''')
|
| 71 |
-
|
| 72 |
-
q.to_parquet(f"s3://{bucket}/{relative_key}")
|
| 73 |
-
|
| 74 |
-
if q.count().execute() == 0:
|
| 75 |
-
break
|
| 76 |
-
|
| 77 |
-
offset += batch_limit
|
| 78 |
-
i += 1
|
| 79 |
-
|
| 80 |
-
return i
|
| 81 |
-
|
| 82 |
-
def join_large_geoms(con, s3, bucket, path, name, zoom):
|
| 83 |
-
"""
|
| 84 |
-
If we had to process large geoms individually, join those datasets after conversion.
|
| 85 |
-
"""
|
| 86 |
-
# check if any large files exist before trying to join
|
| 87 |
-
test_key = f"{path}/hex/zoom{zoom}/{name}_large_000.parquet"
|
| 88 |
|
| 89 |
-
|
| 90 |
-
print("✅ No large geometry chunks to join.")
|
| 91 |
-
return
|
| 92 |
-
|
| 93 |
-
# join if it exists
|
| 94 |
con.raw_sql(f'''
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
TO 's3://{bucket}/{path}/hex/zoom{zoom}/{name}_large.parquet'
|
| 99 |
-
(FORMAT PARQUET)
|
| 100 |
''')
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
def chunk_geom(con, s3, bucket, path, name, zoom, limit, batch_limit, big_n):
|
| 104 |
-
"""
|
| 105 |
-
Processing files in chunks.
|
| 106 |
-
"""
|
| 107 |
-
offset = 0
|
| 108 |
-
i = 0
|
| 109 |
-
|
| 110 |
-
while True:
|
| 111 |
-
chunk_path = f"{path}/hex/zoom{zoom}/{name}_chunk{i:03d}.parquet"
|
| 112 |
-
|
| 113 |
-
if exists_on_s3(s3, folder="", file=chunk_path): # relative path passed as file
|
| 114 |
-
print(f"⏩ Skipping existing chunk: {chunk_path}")
|
| 115 |
-
offset += limit
|
| 116 |
-
i += 1
|
| 117 |
-
continue
|
| 118 |
-
|
| 119 |
-
print(f"📝 Writing chunk {i} → {chunk_path}")
|
| 120 |
-
q = con.sql(f'''
|
| 121 |
-
SELECT *, UNNEST(h{zoom}) AS h{zoom}
|
| 122 |
-
FROM t2
|
| 123 |
-
WHERE len(h{zoom}) <= {big_n}
|
| 124 |
-
LIMIT {limit} OFFSET {offset}
|
| 125 |
-
''')
|
| 126 |
-
q.to_parquet(f"s3://{bucket}/{chunk_path}")
|
| 127 |
-
if q.count().execute() == 0:
|
| 128 |
-
break
|
| 129 |
-
offset += limit
|
| 130 |
-
i += 1
|
| 131 |
-
|
| 132 |
-
# process large geometries using same threshold and limit
|
| 133 |
-
chunk_large_geom(con, s3, bucket, path, name, zoom, big_n, batch_limit)
|
| 134 |
-
join_large_geoms(con, s3, bucket, path, name, zoom)
|
| 135 |
-
return i
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
def join_chunked(con, bucket, path, name, zoom):
|
| 140 |
-
"""
|
| 141 |
-
If we had to chunk the data, join those datasets after conversion.
|
| 142 |
-
"""
|
| 143 |
-
con.raw_sql(f'''
|
| 144 |
-
COPY (
|
| 145 |
-
SELECT * FROM read_parquet('s3://{bucket}/{path}/hex/zoom{zoom}/{name}_chunk*.parquet')
|
| 146 |
-
)
|
| 147 |
-
TO 's3://{bucket}/{path}/hex/zoom{zoom}/{name}.parquet'
|
| 148 |
-
(FORMAT PARQUET)
|
| 149 |
-
''')
|
| 150 |
-
|
| 151 |
-
def convert_h3(con, s3, folder, file, cols, zoom=default_zoom, limit=chunk_n, batch_limit = batch_n, big_n=big_n, max_h3_n = max_h3_n):
|
| 152 |
-
"""
|
| 153 |
-
Driver function to convert geometries to h3
|
| 154 |
-
"""
|
| 155 |
-
cols = ", ".join(cols) if isinstance(cols, list) else cols
|
| 156 |
-
bucket, path = info(folder, file)
|
| 157 |
-
path, file = os.path.split(path)
|
| 158 |
-
name, ext = os.path.splitext(file)
|
| 159 |
-
name = name.replace('-', '')
|
| 160 |
-
|
| 161 |
-
print(f"Processing: {name}")
|
| 162 |
-
con.read_parquet(f"s3://{bucket}/{path}/{file}", table_name=name)
|
| 163 |
-
|
| 164 |
-
# Decide to chunk or not
|
| 165 |
-
est_total, max_per_geom = check_size(con, name, zoom, sample_size=100)
|
| 166 |
-
if est_total > max_h3_n or max_per_geom > big_n:
|
| 167 |
-
print("Chunking due to estimated size")
|
| 168 |
-
compute_h3(con, name, cols, zoom)
|
| 169 |
-
chunk_geom(con, s3, bucket, path, name, zoom, limit, batch_limit, big_n)
|
| 170 |
-
join_chunked(con, bucket, path, name, zoom)
|
| 171 |
-
else:
|
| 172 |
-
print("Writing single output")
|
| 173 |
-
compute_h3(con, name, cols, zoom)
|
| 174 |
-
con.sql(f'''
|
| 175 |
-
SELECT *, UNNEST(h{zoom}) AS h{zoom}
|
| 176 |
-
FROM t2
|
| 177 |
-
''').to_parquet(f"s3://{bucket}/{path}/hex/zoom{zoom}/{name}.parquet")
|
|
|
|
| 1 |
from utils import *
|
| 2 |
+
import re
|
| 3 |
|
| 4 |
+
def convert_h3(con, s3, folder, file, cols, zoom):
|
| 5 |
+
"""
|
| 6 |
+
Driver function to convert geometries to h3.
|
| 7 |
+
If no zoom levels exist -> compute from geometry at target zoom.
|
| 8 |
+
If lower zoom exists -> compute children from max available until target zoom.
|
| 9 |
+
"""
|
| 10 |
+
cols = ", ".join(cols) if isinstance(cols, list) else cols
|
| 11 |
+
bucket, path = info(folder, file)
|
| 12 |
+
path, file = os.path.split(path)
|
| 13 |
+
name, ext = os.path.splitext(file)
|
| 14 |
+
name = name.replace('-', '')
|
| 15 |
+
print(f"Processing: {name}")
|
| 16 |
|
| 17 |
+
hex_paths = s3.list_objects(bucket, prefix=f"{path}/hex/", recursive=True)
|
| 18 |
+
zooms = []
|
| 19 |
+
# check what zooms exist
|
| 20 |
+
for obj in hex_paths:
|
| 21 |
+
match = re.search(r"/zoom(\d{1,2})/", obj.object_name)
|
| 22 |
+
if match:
|
| 23 |
+
zooms.append(int(match.group(1)))
|
| 24 |
+
|
| 25 |
+
if not zooms: # if no h3 files exist
|
| 26 |
+
print(f'No h3 files exists, computing {zoom} from geometry.')
|
| 27 |
+
con.read_parquet(f"s3://{bucket}/{path}/{file}", table_name=name)
|
| 28 |
+
h3_from_geom(con, name, cols, zoom)
|
| 29 |
+
con.sql(f'''
|
| 30 |
+
SELECT {cols}, UNNEST(h{zoom}) AS h{zoom}
|
| 31 |
+
FROM t2
|
| 32 |
+
''').to_parquet(f"s3://{bucket}/{path}/hex/zoom{zoom}/{name}.parquet")
|
| 33 |
+
|
| 34 |
+
else:
|
| 35 |
+
current_zoom = max(zooms)
|
| 36 |
+
|
| 37 |
+
if zoom in zooms:
|
| 38 |
+
print(f'Zoom {zoom} already exists!')
|
| 39 |
+
return
|
| 40 |
+
|
| 41 |
+
elif current_zoom < zoom: #compute child of most refined zoom level
|
| 42 |
+
print(f'Reading zoom {current_zoom}')
|
| 43 |
+
con.read_parquet(
|
| 44 |
+
f"s3://{bucket}/{path}/hex/zoom{current_zoom}/{name}.parquet",
|
| 45 |
+
table_name=f"h3_h{current_zoom}"
|
| 46 |
+
)
|
| 47 |
+
print(f'Computing {zoom} from {current_zoom}')
|
| 48 |
+
|
| 49 |
+
for z in range(current_zoom + 1, zoom + 1):
|
| 50 |
+
print(f'Current zoom {z}')
|
| 51 |
+
h3_from_parent(con, z)
|
| 52 |
+
con.sql(f'''
|
| 53 |
+
SELECT *, UNNEST(h3_cell_to_children(h{z-1}, {z})) AS h{z}
|
| 54 |
+
FROM h3_h{z-1}
|
| 55 |
+
''').to_parquet(f"s3://{bucket}/{path}/hex/zoom{z}/{name}.parquet")
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def h3_from_geom(con, name, cols, zoom):
|
| 59 |
"""
|
| 60 |
+
Computes hexes directly from geometry.
|
| 61 |
"""
|
| 62 |
con.raw_sql(f'''
|
| 63 |
CREATE OR REPLACE TEMP TABLE t2 AS
|
|
|
|
| 66 |
FROM {name}
|
| 67 |
)
|
| 68 |
SELECT {cols},
|
| 69 |
+
h3_polygon_wkt_to_cells_string(ST_Force2D(UNNEST(geom).geom), {zoom}) AS h{zoom}
|
| 70 |
FROM t1
|
| 71 |
''')
|
|
|
|
| 73 |
|
| 74 |
+
def h3_from_parent(con, zoom):
|
| 75 |
con.raw_sql(f'''
|
| 76 |
+
CREATE OR REPLACE TEMP TABLE h3_h{zoom} AS
|
| 77 |
+
SELECT *, UNNEST(h3_cell_to_children(h{zoom-1}, {zoom})) AS h{zoom}
|
| 78 |
+
FROM h3_h{zoom-1}
|
| 79 |
''')
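A quick way to sanity-check the parent-to-child expansion that h3_from_parent relies on (a hedged, self-contained snippet; it assumes the DuckDB community h3 extension is available and uses only the functions already referenced in h3_utils.py, with an arbitrary example polygon):

```python
import duckdb

# Each H3 cell has 7 children at the next resolution (6 for the 12 pentagon
# cells), so row counts grow roughly 7x per zoom step.
con = duckdb.connect()
con.sql("INSTALL h3 FROM community; LOAD h3;")
wkt = "POLYGON ((-121.6 38.4, -121.4 38.4, -121.4 38.6, -121.6 38.6, -121.6 38.4))"
print(con.sql(f"""
    SELECT h7, len(h3_cell_to_children(h7, 8)) AS n_children
    FROM (SELECT UNNEST(h3_polygon_wkt_to_cells_string('{wkt}', 7)) AS h7)
    LIMIT 5
""").fetchall())
```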
|
preprocess/preprocess.ipynb
CHANGED
|
@@ -10,7 +10,7 @@
|
|
| 10 |
},
|
| 11 |
{
|
| 12 |
"cell_type": "code",
|
| 13 |
-
"execution_count":
|
| 14 |
"id": "f7e6298c-d886-432a-a1b7-c3fee914c24f",
|
| 15 |
"metadata": {
|
| 16 |
"editable": true,
|
|
@@ -48,12 +48,12 @@
|
|
| 48 |
},
|
| 49 |
{
|
| 50 |
"cell_type": "code",
|
| 51 |
-
"execution_count":
|
| 52 |
"id": "63dd33b8-6d3c-4852-9899-6ed5775d19c0",
|
| 53 |
"metadata": {},
|
| 54 |
"outputs": [],
|
| 55 |
"source": [
|
| 56 |
-
"def get_url(folder, file, base_folder = 'CBN
|
| 57 |
" minio = 'https://minio.carlboettiger.info/'\n",
|
| 58 |
" bucket = 'public-ca30x30'\n",
|
| 59 |
" if base_folder is None:\n",
|
|
@@ -80,7 +80,7 @@
|
|
| 80 |
},
|
| 81 |
{
|
| 82 |
"cell_type": "code",
|
| 83 |
-
"execution_count":
|
| 84 |
"id": "13214bbe-3a74-4247-981f-5a6eb6c486f5",
|
| 85 |
"metadata": {},
|
| 86 |
"outputs": [],
|
|
@@ -90,7 +90,7 @@
|
|
| 90 |
"# ca_raw_parquet = 'ca_areas.parquet'\n",
|
| 91 |
"\n",
|
| 92 |
"# Boundary of CA, used to computed 'non-conserved' areas\n",
|
| 93 |
-
"ca_boundary_parquet = get_url('Preprocessing','ca_boundary.parquet',base_folder = None)\n",
|
| 94 |
"\n",
|
| 95 |
"# newly protected areas \n",
|
| 96 |
"newly_protected = get_url('Progress_data_new_protection/Newly_counted_lands','newly_counted_lands_2024.parquet')\n",
|
|
@@ -167,43 +167,10 @@
|
|
| 167 |
},
|
| 168 |
{
|
| 169 |
"cell_type": "code",
|
| 170 |
-
"execution_count":
|
| 171 |
"id": "0f9666d1-7c2b-45af-9399-e4189bba34f5",
|
| 172 |
"metadata": {},
|
| 173 |
-
"outputs": [
|
| 174 |
-
{
|
| 175 |
-
"data": {
|
| 176 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 177 |
-
"model_id": "52ef18913f17417299860d91e36e9dbd",
|
| 178 |
-
"version_major": 2,
|
| 179 |
-
"version_minor": 0
|
| 180 |
-
},
|
| 181 |
-
"text/plain": [
|
| 182 |
-
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
|
| 183 |
-
]
|
| 184 |
-
},
|
| 185 |
-
"metadata": {},
|
| 186 |
-
"output_type": "display_data"
|
| 187 |
-
},
|
| 188 |
-
{
|
| 189 |
-
"name": "stdout",
|
| 190 |
-
"output_type": "stream",
|
| 191 |
-
"text": [
|
| 192 |
-
"CPU times: user 4min 28s, sys: 6.1 s, total: 4min 34s\n",
|
| 193 |
-
"Wall time: 2min 18s\n"
|
| 194 |
-
]
|
| 195 |
-
},
|
| 196 |
-
{
|
| 197 |
-
"data": {
|
| 198 |
-
"text/plain": [
|
| 199 |
-
"<minio.helpers.ObjectWriteResult at 0x7ff0943c7710>"
|
| 200 |
-
]
|
| 201 |
-
},
|
| 202 |
-
"execution_count": 7,
|
| 203 |
-
"metadata": {},
|
| 204 |
-
"output_type": "execute_result"
|
| 205 |
-
}
|
| 206 |
-
],
|
| 207 |
"source": [
|
| 208 |
"%%time \n",
|
| 209 |
"# match CA Nature schema \n",
|
|
@@ -241,7 +208,7 @@
|
|
| 241 |
},
|
| 242 |
{
|
| 243 |
"cell_type": "code",
|
| 244 |
-
"execution_count":
|
| 245 |
"id": "a3d4f189-1563-4868-9f1f-64d67569df27",
|
| 246 |
"metadata": {},
|
| 247 |
"outputs": [],
|
|
@@ -298,7 +265,7 @@
|
|
| 298 |
},
|
| 299 |
{
|
| 300 |
"cell_type": "code",
|
| 301 |
-
"execution_count":
|
| 302 |
"id": "a59c976b-3c36-40f9-a15b-cefcd155c647",
|
| 303 |
"metadata": {},
|
| 304 |
"outputs": [],
|
|
@@ -344,58 +311,10 @@
|
|
| 344 |
},
|
| 345 |
{
|
| 346 |
"cell_type": "code",
|
| 347 |
-
"execution_count":
|
| 348 |
"id": "4d6177e2-8ece-4eb9-acc2-5fb5c5beb8bb",
|
| 349 |
"metadata": {},
|
| 350 |
-
"outputs": [
|
| 351 |
-
{
|
| 352 |
-
"data": {
|
| 353 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 354 |
-
"model_id": "09f24f1359a84ae2a4b69360cc8e852b",
|
| 355 |
-
"version_major": 2,
|
| 356 |
-
"version_minor": 0
|
| 357 |
-
},
|
| 358 |
-
"text/plain": [
|
| 359 |
-
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
|
| 360 |
-
]
|
| 361 |
-
},
|
| 362 |
-
"metadata": {},
|
| 363 |
-
"output_type": "display_data"
|
| 364 |
-
},
|
| 365 |
-
{
|
| 366 |
-
"data": {
|
| 367 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 368 |
-
"model_id": "c10ce980d24e45b6bad9b8a70c176f2c",
|
| 369 |
-
"version_major": 2,
|
| 370 |
-
"version_minor": 0
|
| 371 |
-
},
|
| 372 |
-
"text/plain": [
|
| 373 |
-
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
|
| 374 |
-
]
|
| 375 |
-
},
|
| 376 |
-
"metadata": {},
|
| 377 |
-
"output_type": "display_data"
|
| 378 |
-
},
|
| 379 |
-
{
|
| 380 |
-
"name": "stderr",
|
| 381 |
-
"output_type": "stream",
|
| 382 |
-
"text": [
|
| 383 |
-
"/opt/conda/lib/python3.12/site-packages/ibis/common/deferred.py:408: FutureWarning: `Value.case` is deprecated as of v10.0.0; use value.cases() or ibis.cases()\n",
|
| 384 |
-
" return func(*args, **kwargs)\n"
|
| 385 |
-
]
|
| 386 |
-
},
|
| 387 |
-
{
|
| 388 |
-
"ename": "NameError",
|
| 389 |
-
"evalue": "name 'non_conserved' is not defined",
|
| 390 |
-
"output_type": "error",
|
| 391 |
-
"traceback": [
|
| 392 |
-
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 393 |
-
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
|
| 394 |
-
"\u001b[36mFile \u001b[39m\u001b[32m<timed exec>:50\u001b[39m\n",
|
| 395 |
-
"\u001b[31mNameError\u001b[39m: name 'non_conserved' is not defined"
|
| 396 |
-
]
|
| 397 |
-
}
|
| 398 |
-
],
|
| 399 |
"source": [
|
| 400 |
"%%time \n",
|
| 401 |
"counties = con.read_parquet('../CA_counties.parquet')\n",
|
|
@@ -454,7 +373,7 @@
|
|
| 454 |
"gdf = all_data.execute()\n",
|
| 455 |
"\n",
|
| 456 |
"gdf.set_crs(\"epsg:3310\").to_parquet(ca_base_parquet)\n",
|
| 457 |
-
"s3.fput_object(\"public-ca30x30\", 'Preprocessing/'+ca_base_parquet, ca_base_parquet) "
|
| 458 |
]
|
| 459 |
},
|
| 460 |
{
|
|
@@ -485,7 +404,7 @@
|
|
| 485 |
"\n",
|
| 486 |
"def get_habitat_type(fieldname):\n",
|
| 487 |
" aux_xml_path = 'fveg22_1_processed.tif.aux.xml'\n",
|
| 488 |
-
" s3.fget_object('public-ca30x30','CBN
|
| 489 |
" tree = ET.parse(aux_xml_path)\n",
|
| 490 |
" root = tree.find(\".//GDALRasterAttributeTable\")\n",
|
| 491 |
" field_names = [f.find(\"Name\").text for f in root.findall(\"FieldDefn\")]\n",
|
|
|
|
| 10 |
},
|
| 11 |
{
|
| 12 |
"cell_type": "code",
|
| 13 |
+
"execution_count": null,
|
| 14 |
"id": "f7e6298c-d886-432a-a1b7-c3fee914c24f",
|
| 15 |
"metadata": {
|
| 16 |
"editable": true,
|
|
|
|
| 48 |
},
|
| 49 |
{
|
| 50 |
"cell_type": "code",
|
| 51 |
+
"execution_count": null,
|
| 52 |
"id": "63dd33b8-6d3c-4852-9899-6ed5775d19c0",
|
| 53 |
"metadata": {},
|
| 54 |
"outputs": [],
|
| 55 |
"source": [
|
| 56 |
+
"def get_url(folder, file, base_folder = 'CBN'):\n",
|
| 57 |
" minio = 'https://minio.carlboettiger.info/'\n",
|
| 58 |
" bucket = 'public-ca30x30'\n",
|
| 59 |
" if base_folder is None:\n",
|
|
|
|
| 80 |
},
|
| 81 |
{
|
| 82 |
"cell_type": "code",
|
| 83 |
+
"execution_count": null,
|
| 84 |
"id": "13214bbe-3a74-4247-981f-5a6eb6c486f5",
|
| 85 |
"metadata": {},
|
| 86 |
"outputs": [],
|
|
|
|
| 90 |
"# ca_raw_parquet = 'ca_areas.parquet'\n",
|
| 91 |
"\n",
|
| 92 |
"# Boundary of CA, used to computed 'non-conserved' areas\n",
|
| 93 |
+
"ca_boundary_parquet = get_url('CA_Nature/2024/Preprocessing','ca_boundary.parquet',base_folder = None)\n",
|
| 94 |
"\n",
|
| 95 |
"# newly protected areas \n",
|
| 96 |
"newly_protected = get_url('Progress_data_new_protection/Newly_counted_lands','newly_counted_lands_2024.parquet')\n",
|
|
|
|
| 167 |
},
|
| 168 |
{
|
| 169 |
"cell_type": "code",
|
| 170 |
+
"execution_count": null,
|
| 171 |
"id": "0f9666d1-7c2b-45af-9399-e4189bba34f5",
|
| 172 |
"metadata": {},
|
| 173 |
+
"outputs": [],
|
| 174 |
"source": [
|
| 175 |
"%%time \n",
|
| 176 |
"# match CA Nature schema \n",
|
|
|
|
| 208 |
},
|
| 209 |
{
|
| 210 |
"cell_type": "code",
|
| 211 |
+
"execution_count": null,
|
| 212 |
"id": "a3d4f189-1563-4868-9f1f-64d67569df27",
|
| 213 |
"metadata": {},
|
| 214 |
"outputs": [],
|
|
|
|
| 265 |
},
|
| 266 |
{
|
| 267 |
"cell_type": "code",
|
| 268 |
+
"execution_count": null,
|
| 269 |
"id": "a59c976b-3c36-40f9-a15b-cefcd155c647",
|
| 270 |
"metadata": {},
|
| 271 |
"outputs": [],
|
|
|
|
| 311 |
},
|
| 312 |
{
|
| 313 |
"cell_type": "code",
|
| 314 |
+
"execution_count": null,
|
| 315 |
"id": "4d6177e2-8ece-4eb9-acc2-5fb5c5beb8bb",
|
| 316 |
"metadata": {},
|
| 317 |
+
"outputs": [],
|
| 318 |
"source": [
|
| 319 |
"%%time \n",
|
| 320 |
"counties = con.read_parquet('../CA_counties.parquet')\n",
|
|
|
|
| 373 |
"gdf = all_data.execute()\n",
|
| 374 |
"\n",
|
| 375 |
"gdf.set_crs(\"epsg:3310\").to_parquet(ca_base_parquet)\n",
|
| 376 |
+
"s3.fput_object(\"public-ca30x30\", 'CA_Nature/2024/Preprocessing/'+ca_base_parquet, ca_base_parquet) "
|
| 377 |
]
|
| 378 |
},
|
| 379 |
{
|
|
|
|
| 404 |
"\n",
|
| 405 |
"def get_habitat_type(fieldname):\n",
|
| 406 |
" aux_xml_path = 'fveg22_1_processed.tif.aux.xml'\n",
|
| 407 |
+
" s3.fget_object('public-ca30x30','CBN/Habitat/'+aux_xml_path, aux_xml_path)\n",
|
| 408 |
" tree = ET.parse(aux_xml_path)\n",
|
| 409 |
" root = tree.find(\".//GDALRasterAttributeTable\")\n",
|
| 410 |
" field_names = [f.find(\"Name\").text for f in root.findall(\"FieldDefn\")]\n",
|
preprocess/utils.py
CHANGED
|
@@ -15,7 +15,7 @@ from shapely.geometry import shape
|
|
| 15 |
import numpy as np
|
| 16 |
|
| 17 |
|
| 18 |
-
def info(folder, file, bucket = "public-ca30x30", base_folder = 'CBN
|
| 19 |
"""
|
| 20 |
Extract minio path to upload/download data
|
| 21 |
"""
|
|
@@ -77,22 +77,10 @@ def process_raster(s3, folder, file, file_name = None):
|
|
| 77 |
"""
|
| 78 |
if file_name:
|
| 79 |
file = file_name
|
| 80 |
-
# output_file = reproject_raster(file)
|
| 81 |
-
# upload(s3, folder, output_file)
|
| 82 |
-
# output_cog_file = make_cog(output_file)
|
| 83 |
-
# upload(s3, folder, output_cog_file)
|
| 84 |
-
# output_vector, cols = make_vector(output_file)
|
| 85 |
-
# upload(s3, folder, output_vector)
|
| 86 |
-
|
| 87 |
name, ext = os.path.splitext(file)
|
| 88 |
output_file = f"{name}_processed{ext}"
|
| 89 |
-
|
| 90 |
output_cog_file = f"{name}_processed_COG{ext}"
|
| 91 |
-
|
| 92 |
output_vector_file = f"{name}_processed.parquet"
|
| 93 |
-
print(output_file)
|
| 94 |
-
print(output_cog_file)
|
| 95 |
-
print(output_vector_file)
|
| 96 |
# Reproject raster
|
| 97 |
if not exists_on_s3(s3, folder, output_file):
|
| 98 |
output_file = reproject_raster(file)
|
|
@@ -183,7 +171,6 @@ def make_vector(input_file, crs="EPSG:4326"):
|
|
| 183 |
gdf.to_crs(crs, inplace=True)
|
| 184 |
|
| 185 |
gdf.to_parquet(output_file)
|
| 186 |
-
print(gdf)
|
| 187 |
return output_file, gdf.drop('geom',axis = 1).columns.to_list()
|
| 188 |
|
| 189 |
def filter_raster(s3, folder, file, percentile):
|
|
@@ -226,9 +213,6 @@ def exists_on_s3(s3, folder, file):
|
|
| 226 |
Check if a file exists on S3
|
| 227 |
"""
|
| 228 |
bucket, path = info(folder, file)
|
| 229 |
-
print(bucket)
|
| 230 |
-
print(path)
|
| 231 |
-
|
| 232 |
try:
|
| 233 |
s3.stat_object(bucket, path)
|
| 234 |
return True
|
|
|
|
| 15 |
import numpy as np
|
| 16 |
|
| 17 |
|
| 18 |
+
def info(folder, file, bucket = "public-ca30x30", base_folder = 'CBN/'):
|
| 19 |
"""
|
| 20 |
Extract minio path to upload/download data
|
| 21 |
"""
|
|
|
|
| 77 |
"""
|
| 78 |
if file_name:
|
| 79 |
file = file_name
|
| 80 |
name, ext = os.path.splitext(file)
|
| 81 |
output_file = f"{name}_processed{ext}"
|
|
|
|
| 82 |
output_cog_file = f"{name}_processed_COG{ext}"
|
|
|
|
| 83 |
output_vector_file = f"{name}_processed.parquet"
|
| 84 |
# Reproject raster
|
| 85 |
if not exists_on_s3(s3, folder, output_file):
|
| 86 |
output_file = reproject_raster(file)
|
|
|
|
| 171 |
gdf.to_crs(crs, inplace=True)
|
| 172 |
|
| 173 |
gdf.to_parquet(output_file)
|
|
|
|
| 174 |
return output_file, gdf.drop('geom',axis = 1).columns.to_list()
|
| 175 |
|
| 176 |
def filter_raster(s3, folder, file, percentile):
|
|
|
|
| 213 |
Check if a file exists on S3
|
| 214 |
"""
|
| 215 |
bucket, path = info(folder, file)
|
| 216 |
try:
|
| 217 |
s3.stat_object(bucket, path)
|
| 218 |
return True
|