cassiebuhler committed
Commit 0d2d459 · 1 Parent(s): db54b60

wip; tinkering hyperparameters for large geoms

Files changed (3)
  1. preprocess/CBN-data.ipynb +124 -170
  2. preprocess/h3_utils.py +42 -40
  3. preprocess/utils.py +72 -33
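For quick reference, the "hyperparameters" mentioned in the commit message are the new module-level constants at the top of preprocess/h3_utils.py. A condensed sketch of their values as of this commit (the trailing comments are added here for orientation and are not part of the source):

# Tuning knobs for converting large geometries to H3 (preprocess/h3_utils.py)
default_zoom = "8"                        # H3 resolution used when hexing
default_limit = 10_000                    # rows per chunk when a file is chunked
default_geom_len_thresh = 5_000           # H3 cells per geometry before it counts as "large"
chunk_limit = default_limit
large_geom_thresh = default_geom_len_thresh
est_total_h3_thresh = 150_000             # estimated total cells above which output is chunked
large_geom_batch_limit = 100              # large geometries written per batch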
preprocess/CBN-data.ipynb CHANGED
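The notebook edits below repeat one per-dataset pattern with the revised helper signatures: the MinIO client s3 and the ibis DuckDB connection con are now passed explicitly, and process_vector / process_raster return the non-geometry column names that feed convert_h3. A minimal sketch of one iteration, assuming s3 and set_secrets are created earlier in the notebook and that the helpers are importable as shown:

import ibis
from utils import unzip, process_vector, convert_pmtiles       # preprocess/utils.py
from h3_utils import convert_h3                                 # preprocess/h3_utils.py

con = ibis.duckdb.connect(extensions=["spatial", "h3"])         # fresh DuckDB connection per dataset
set_secrets(con)                                                # registers S3 credentials (defined elsewhere)
# s3: MinIO client, assumed to be created earlier in the notebook

folder = 'Counties'
name = 'CA_counties'

unzip(s3, folder=folder, file='30x30_Counties.zip')                     # fetch + extract the source archive
cols = process_vector(s3, folder=folder, file=f"{name}.shp")            # reproject, write parquet, return attribute columns
convert_pmtiles(con, s3, folder=folder, file=f"{name}.parquet")         # tippecanoe PMTiles for the web map
convert_h3(con, s3, folder=folder, file=f"{name}.parquet", cols=cols)   # H3 hex index at the default zoom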
@@ -63,11 +63,11 @@
63
  "folder = 'Counties'\n",
64
  "name = 'CA_counties'\n",
65
  "\n",
66
- "# unzip(folder = folder, file = '30x30_Counties.zip')\n",
67
- "# process_vector(folder = folder, file = f\"{name}.shp\")\n",
68
- "# convert_pmtiles(folder = folder, file = f\"{name}.parquet\")\n",
69
  "\n",
70
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= \"COUNTY_NAM\")"
71
  ]
72
  },
73
  {
@@ -91,9 +91,9 @@
91
  "\n",
92
  "folder = 'Climate_zones'\n",
93
  "name = 'climate_zones_10'\n",
94
- "# download(folder = folder, file = 'clusters_10.tif')\n",
95
- "# process_raster(s3, folder = folder, file = 'clusters_10.tif', file_name = f\"{name}.tif\")\n",
96
- "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols= \"id\")"
97
  ]
98
  },
99
  {
@@ -118,10 +118,10 @@
118
  "folder = 'Ecoregion'\n",
119
  "name = 'ACE_ecoregions'\n",
120
  "\n",
121
- "unzip(folder = folder, file = '30x30_Ecoregions.zip')\n",
122
- "process_vector(folder = folder, file = f\"{name}.shp\")\n",
123
  "\n",
124
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= 'CA_Ecoregi')"
125
  ]
126
  },
127
  {
@@ -148,7 +148,7 @@
148
  "outputs": [],
149
  "source": [
150
  "# download(folder = 'Habitat', file = 'CWHR13_2022.tif')\n",
151
- "# process_raster(s3, folder = 'Habitat', file = 'CWHR13_2022.tif')"
152
  ]
153
  },
154
  {
@@ -165,20 +165,20 @@
165
  "folder = 'Habitat'\n",
166
  "name = 'fveg22_1'\n",
167
  "\n",
168
- "# unzip(folder = folder, file = 'fveg221gdb.zip')\n",
169
  "\n",
170
- "# command = [\n",
171
- "# \"gdalwarp\",\n",
172
- "# \"-of\", \"GTiff\",\n",
173
- "# 'fveg22_1.gdb',\n",
174
- "# 'fveg22_1.tif' \n",
175
- "# ]\n",
176
  "\n",
177
- "# subprocess.run(command, check=True)\n",
178
- "process_raster(s3, folder = folder, file = f\"{name}.tif\")\n",
179
  "upload(folder = folder, file = f'{name}_processed.tif.aux.xml')\n",
180
  "\n",
181
- "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols= \"id\")"
182
  ]
183
  },
184
  {
@@ -211,11 +211,11 @@
211
  "download(folder = folder, file = 'Terrestrial_Biodiversity_Summary_-_ACE_[ds2739].geojson',\n",
212
  " file_name = f\"{name}.geojson\")\n",
213
  "\n",
214
- "process_vector(folder = folder, file = f\"{name}.geojson\")\n",
215
- "# convert_pmtiles(folder = folder, file = f\"{name}.geojson\")\n",
216
  "\n",
217
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= \"OBJECTID\")\n",
218
- "# gdf = gpd.read_parquet(f\"{name}.parquet\")\n"
219
  ]
220
  },
221
  {
@@ -244,8 +244,8 @@
244
  " 'County', 'Shape__Area', 'Shape__Length', 'geometry']\n",
245
  " cols.append(col) #select only the cols we want + the new col. \n",
246
  " rank_df = gdf[gdf[col]==5][cols]# filter ranks = 5\n",
247
- " process_vector(folder = 'ACE_biodiversity/'+name, file = name+'.parquet',gdf = rank_df)\n",
248
- " convert_pmtiles(folder ='ACE_biodiversity/'+name, file = name+'.parquet')\n"
249
  ]
250
  },
251
  {
@@ -283,14 +283,32 @@
283
  " percentile = 0.95\n",
284
  " threshold = gdf[col].quantile(percentile)\n",
285
  " ace = gdf[gdf[col]>=threshold][cols]\n",
286
- " process_vector(folder = 'ACE_biodiversity/'+name, file = name+'.parquet',gdf = ace)\n",
287
- " convert_pmtiles(folder ='ACE_biodiversity/'+name, file = name+'.parquet')\n",
288
  "\n",
289
  "\n",
290
  "# calculate 80% percentile, filter to those >= threshold. \n",
291
  "# subset to calculate acres within each network, % of feature conserved and % of network "
292
  ]
293
  },
 
 
 
 
 
 
294
  {
295
  "cell_type": "markdown",
296
  "id": "6991222f-7d24-4f10-9ee0-db20513405d6",
@@ -321,9 +339,9 @@
321
  "folder = 'Biodiversity_unique/Plant_richness'\n",
322
  "name = 'species_D'\n",
323
  "\n",
324
- "download(folder = folder, file = f\"{name}.tif\")\n",
325
- "filter_raster(folder = folder, file = f\"{name}.tif\", percentile = 80)\n",
326
- "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols= \"id\")"
327
  ]
328
  },
329
  {
@@ -348,9 +366,9 @@
348
  "folder = 'Biodiversity_unique/Rarityweighted_endemic_plant_richness'\n",
349
  "name = 'endemicspecies_E'\n",
350
  "\n",
351
- "download(folder = folder, file = f\"{name}.tif\")\n",
352
- "filter_raster(folder = folder, file = f\"{name}.tif\", percentile = 80)\n",
353
- "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols= \"id\")"
354
  ]
355
  },
356
  {
@@ -392,8 +410,8 @@
392
  "folder = 'Connectivity_resilience/Resilient_connected_network_allcategories'\n",
393
  "name = 'rcn_wIntactBioCat_caOnly_2020-10-27'\n",
394
  "\n",
395
- "process_raster(s3, folder = folder, file = f\"{name}.tif\")\n",
396
- "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols= \"id\")"
397
  ]
398
  },
399
  {
@@ -468,20 +486,21 @@
468
  "outputs": [],
469
  "source": [
470
  "%%time \n",
471
  "\n",
472
  "folder = 'Freshwater_resources/Wetlands'\n",
473
  "name = 'CA_wetlands'\n",
474
  "\n",
475
  "# only pick a subset \n",
476
- "unzip(folder = folder, file = 'CA_geodatabase_wetlands.zip')\n",
477
  "gdf = gpd.read_file('CA_geodatabase_wetlands.gdb')\n",
478
  "wetlands = ['Freshwater Emergent Wetland', 'Freshwater Forested/Shrub Wetland', 'Estuarine and Marine Wetland']\n",
479
  "gdf = gdf[gdf['WETLAND_TYPE'].isin(wetlands)]\n",
480
  "\n",
481
- "process_vector(folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
482
- "# convert_pmtiles(folder =folder, file = f\"{name}.parquet\")\n",
483
- "geom_to_h3(con, folder = folder, file = f\"{name}.parquet\", cols= ['ATTRIBUTE','WETLAND_TYPE','NWI_ID'])\n",
484
- "\n"
485
  ]
486
  },
487
  {
@@ -580,34 +599,33 @@
580
  "outputs": [],
581
  "source": [
582
  "%%time \n",
583
  "\n",
584
  "folder = 'NBS_agriculture/Farmland'\n",
585
- "unzip(folder = folder, file = 'Important_Farmland_2018.zip')\n",
586
  "\n",
587
  "folder = 'NBS_agriculture/Farmland_all'\n",
588
  "name = 'Important_Farmland_2018'\n",
589
- "process_vector(folder = folder, file = f\"{name}.gdb\")\n",
590
- "# convert_pmtiles(folder = folder, file =f\"{name}.parquet\")\n",
591
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= ['county_nam','polygon_ty'])\n",
592
  "\n",
593
  "# only pick a subset \n",
594
  "folder = 'NBS_agriculture/Farmland_all/Farmland'\n",
595
  "name = 'Farmland_2018'\n",
596
- "# gdf = gpd.read_file('Important_Farmland_2018.gdb')\n",
597
- "# farmland_type = ['P','S','L','U'] # prime, statewide importance, local importance, unique\n",
598
- "# gdf_farmland = gdf[gdf['polygon_ty'].isin(farmland_type)]\n",
599
- "# process_vector(folder = folder, file = f\"{name}.parquet\", gdf = gdf_farmland)\n",
600
- "# convert_pmtiles(folder = folder, file =f\"{name}.parquet\")\n",
601
- "\n",
602
- "\n",
603
  "\n",
604
  "# grazing lands \n",
605
  "folder = 'NBS_agriculture/Farmland_all/Lands_suitable_grazing'\n",
606
  "name = 'Grazing_land_2018'\n",
607
- "\n",
608
- "# gdf_grazing = gdf[gdf['polygon_ty'] == 'G']\n",
609
- "# process_vector(folder = folder, file = f\"{name}.parquet\", gdf = gdf_grazing)\n",
610
- "# convert_pmtiles(folder = folder, file =f\"{name}.parquet\")\n"
611
  ]
612
  },
613
  {
@@ -644,14 +662,6 @@
644
  "Only YEAR >= 2014. "
645
  ]
646
  },
647
- {
648
- "cell_type": "code",
649
- "execution_count": null,
650
- "id": "425f9149-d8ac-437a-9572-301bd1b1bec8",
651
- "metadata": {},
652
- "outputs": [],
653
- "source": []
654
- },
655
  {
656
  "cell_type": "code",
657
  "execution_count": null,
@@ -663,15 +673,15 @@
663
  "folder = 'Climate_risks/Historical_fire_perimeters'\n",
664
  "name = 'calfire_2023'\n",
665
  "\n",
666
- "unzip(folder = folder, file = 'fire23-1gdb.zip')\n",
667
  "gdf = gpd.read_file('fire23_1.gdb')\n",
668
- "gdf = gdf[~gdf['YEAR_'].isna()]\n",
669
- "gdf['YEAR_'] = gdf['YEAR_'].astype('int64')\n",
670
  "# gdf = gdf[gdf['YEAR_']>=2014]\n",
671
- "process_vector(folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
672
- "# convert_pmtiles(folder = folder, file = f\"{name}.parquet\")\n",
673
  "\n",
674
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= ['INC_NUM','FIRE_NAME','YEAR_'])"
675
  ]
676
  },
677
  {
@@ -720,7 +730,7 @@
720
  "Do separately for both climate models - CNRM and MIROC.\n",
721
  "'''\n",
722
  "\n",
723
- "unzip(folder = 'Climate_risks/Mid-century_habitat_climate_exposure', file = 'Midcentury_habitat_climate_exposure.zip')\n",
724
  "\n",
725
  "# still need to do "
726
  ]
@@ -752,10 +762,10 @@
752
  "folder = 'Progress_data_new_protection/Newly_counted_lands'\n",
753
  "name = 'newly_counted_lands_2024'\n",
754
  "\n",
755
- "unzip(folder = folder, file = f\"{name}.shp.zip\")\n",
756
- "process_vector(folder = folder, file = f\"{name}.shp\")\n",
757
- "# convert_pmtiles(folder = folder, file = f\"{name}.parquet\")\n",
758
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= ['ORIG_FID'])\n"
759
  ]
760
  },
761
  {
@@ -777,10 +787,10 @@
777
  "folder = 'Progress_data_new_protection/DAC'\n",
778
  "name = 'DAC_2022'\n",
779
  "\n",
780
- "unzip(folder = folder, file = 'sb535dacgdbf2022gdb.zip')\n",
781
- "process_vector(folder = folder, file = 'SB535DACgdb_F_2022.gdb', file_name = f\"{name}.parquet\")\n",
782
- "# convert_pmtiles(folder = folder, file = f\"{name}.parquet\")\n",
783
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= ['GEOID'])\n"
784
  ]
785
  },
786
  {
@@ -804,34 +814,17 @@
804
  "\n",
805
  "folder = 'Progress_data_new_protection/Priority_populations'\n",
806
  "name = 'CalEnviroScreen4'\n",
807
- "unzip(folder = folder, file = 'Priority Populations 4.0 Geodatabase.zip')\n",
808
  "\n",
809
  "gdf = (con.read_geo('Priority Populations 4.0 Combined Layer.gdb')\n",
810
  " .mutate(id=ibis.row_number().over()) #making a unique id \n",
811
  " ).execute().set_crs('EPSG:3857')\n",
812
  "\n",
813
- "process_vector(folder = folder, file = 'Priority Populations 4.0 Combined Layer.gdb',\n",
814
  " file_name = f\"{name}.parquet\", gdf = gdf)\n",
815
  "\n",
816
- "# convert_pmtiles(folder = folder, file = f\"{name}.parquet\")\n",
817
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= [\"id\"])\n"
818
- ]
819
- },
820
- {
821
- "cell_type": "code",
822
- "execution_count": null,
823
- "id": "e64129da-f369-425f-afcc-bc595a89fb7d",
824
- "metadata": {},
825
- "outputs": [],
826
- "source": [
827
- "file = f\"{name}.parquet\"\n",
828
- "folder = 'Progress_data_new_protection/Priority_populations'\n",
829
- "name = 'CalEnviroScreen4'\n",
830
- "bucket, path = info(folder, file)\n",
831
- "# path, file = os.path.split(path)\n",
832
- "# name, ext = os.path.splitext(file)\n",
833
- "# join_chunked(bucket, path, file)\n",
834
- "con.read_parquet(f\"s3://{bucket}/{folder}/hex/{file}_part_000.parquet\").head(10).execute()"
835
  ]
836
  },
837
  {
@@ -852,12 +845,12 @@
852
  "folder = 'Progress_data_new_protection/Low_income_communities'\n",
853
  "name = 'low_income_CalEnviroScreen4'\n",
854
  "\n",
855
- "unzip(folder = folder, file = 'Priority Populations 4.0 Geodatabase.zip')\n",
856
  "\n",
857
  "gdf = gpd.read_file('Priority Populations 4.0 Combined Layer.gdb')\n",
858
  "gdf = gdf[gdf['Designatio'] =='Low-income community']\n",
859
- "process_vector(folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
860
- "convert_pmtiles(folder = folder, file = f\"{name}.parquet\")"
861
  ]
862
  },
863
  {
@@ -878,10 +871,10 @@
878
  "folder = 'Progress_data_new_protection/Land_Status_Zone_Ecoregion_Counties'\n",
879
  "name = 'all_regions_reGAP_county_eco'\n",
880
  "\n",
881
- "unzip(folder = folder, file = 'Land_Status_Zone_Ecoregion_Counties.shp.zip')\n",
882
- "process_vector(folder = folder, file = 'Land_Status_Zone_Ecoregion_Counties.shp',\n",
883
  " file_name = f\"{name}.parquet\")\n",
884
- "convert_pmtiles(folder = folder, file = f\"{name}.parquet\")"
885
  ]
886
  },
887
  {
@@ -892,43 +885,6 @@
892
  "# CA Nature data"
893
  ]
894
  },
895
- {
896
- "cell_type": "code",
897
- "execution_count": null,
898
- "id": "ecc0f168-badd-4e4d-b97b-ee7891afaa4e",
899
- "metadata": {},
900
- "outputs": [],
901
- "source": [
902
- "# def convert_h3_2(con, folder, file, cols, zoom = \"8\"):\n",
903
- "# cols = \", \".join(cols) if isinstance(cols,list) else cols #unpack columns \n",
904
- "# bucket = 'public-ca30x30'\n",
905
- "# name = 'ca-30x30-base'\n",
906
- "# file = 'ca-30x30-base.parquet'\n",
907
- "# folder = \"Preprocessing\"\n",
908
- "# name= name.replace('-','')\n",
909
- "\n",
910
- "\n",
911
- "# # reproject \n",
912
- "# # (con.read_parquet(f\"s3://{bucket}/{file}\")\n",
913
- "# # .mutate(geom = _.geom.convert('epsg:3310','epsg:4326'))\n",
914
- "# # ).to_parquet(f\"s3://{bucket}/hex/{file}\")\n",
915
- "\n",
916
- "# con.read_parquet(f\"s3://{bucket}/{folder}/{file}\", table_name = name)\n",
917
- "\n",
918
- "# con.sql(f'''\n",
919
- "# WITH t2 AS (\n",
920
- "# WITH t1 AS (\n",
921
- "# SELECT {cols}, ST_Dump(geom) AS geom \n",
922
- "# FROM {name}\n",
923
- "# ) \n",
924
- "# SELECT {cols},\n",
925
- "# h3_polygon_wkt_to_cells_string(UNNEST(geom).geom, {zoom}) AS h{zoom}\n",
926
- "# FROM t1\n",
927
- "# )\n",
928
- "# SELECT *, UNNEST(h{zoom}) AS h{zoom} FROM t2\n",
929
- "# ''').to_parquet(f\"s3://{bucket}/{folder}/hex/{file}\")"
930
- ]
931
- },
932
  {
933
  "cell_type": "code",
934
  "execution_count": null,
@@ -946,46 +902,44 @@
946
  "# download(folder = folder, file = f\"{name}.parquet\")\n",
947
  "\n",
948
  "# gdf = gpd.read_parquet(f\"{name}.parquet\")\n",
949
- "process_vector(folder = folder, file = f\"{name}.parquet\")\n",
950
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= [\"id\"])\n",
951
- "\n",
952
- "# convert_h3_2(con, folder = folder, file = f\"{name}.parquet\", cols= [\"id\"])\n",
953
- "\n"
954
  ]
955
  },
956
  {
957
- "cell_type": "code",
958
- "execution_count": null,
959
- "id": "908786ec-2a86-4fb0-a47a-9de364254806",
960
  "metadata": {},
961
- "outputs": [],
962
  "source": [
963
- "# url = f\"s3://public-ca30x30/{folder}/hex/{name}.parquet\"\n",
964
- "# # url = f\"s3://public-ca30x30/{folder}/{name}.parquet\"\n",
965
- "\n",
966
- "# con.read_parquet(url).head(10).execute()\n"
967
  ]
968
  },
969
  {
970
  "cell_type": "code",
971
  "execution_count": null,
972
- "id": "65d98aa3-041f-42d6-8448-6fa8a05f5850",
973
  "metadata": {},
974
  "outputs": [],
975
  "source": [
976
- "# file = 'ca-30x30-base.parquet'\n",
977
- "# folder = \"Preprocessing\"\n",
978
- "# bucket = 'public-ca30x30'\n",
979
- "# con.read_parquet(f\"s3://{bucket}/{folder}/hex/{file}\").head(10).execute()"
 
 
 
 
 
 
 
980
  ]
981
- },
982
- {
983
- "cell_type": "code",
984
- "execution_count": null,
985
- "id": "fa960a99-3c79-4e67-becf-a1cb397aa5fb",
986
- "metadata": {},
987
- "outputs": [],
988
- "source": []
989
  }
990
  ],
991
  "metadata": {
 
63
  "folder = 'Counties'\n",
64
  "name = 'CA_counties'\n",
65
  "\n",
66
+ "unzip(s3, folder = folder, file = '30x30_Counties.zip')\n",
67
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.shp\")\n",
68
+ "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
69
  "\n",
70
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)"
71
  ]
72
  },
73
  {
 
91
  "\n",
92
  "folder = 'Climate_zones'\n",
93
  "name = 'climate_zones_10'\n",
94
+ "download(folder = folder, file = 'clusters_10.tif')\n",
95
+ "cols = process_raster(s3, folder = folder, file = 'clusters_10.tif', file_name = f\"{name}.tif\")\n",
96
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols)"
97
  ]
98
  },
99
  {
 
118
  "folder = 'Ecoregion'\n",
119
  "name = 'ACE_ecoregions'\n",
120
  "\n",
121
+ "unzip(s3, folder = folder, file = '30x30_Ecoregions.zip')\n",
122
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.shp\")\n",
123
  "\n",
124
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)"
125
  ]
126
  },
127
  {
 
148
  "outputs": [],
149
  "source": [
150
  "# download(folder = 'Habitat', file = 'CWHR13_2022.tif')\n",
151
+ "# cols = process_raster(s3, folder = 'Habitat', file = 'CWHR13_2022.tif')"
152
  ]
153
  },
154
  {
 
165
  "folder = 'Habitat'\n",
166
  "name = 'fveg22_1'\n",
167
  "\n",
168
+ "unzip(s3, folder = folder, file = 'fveg221gdb.zip')\n",
169
  "\n",
170
+ "command = [\n",
171
+ " \"gdalwarp\",\n",
172
+ " \"-of\", \"GTiff\",\n",
173
+ " 'fveg22_1.gdb',\n",
174
+ " 'fveg22_1.tif' \n",
175
+ " ]\n",
176
  "\n",
177
+ "subprocess.run(command, check=True)\n",
178
+ "cols = process_raster(s3, folder = folder, file = f\"{name}.tif\")\n",
179
  "upload(folder = folder, file = f'{name}_processed.tif.aux.xml')\n",
180
  "\n",
181
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols)"
182
  ]
183
  },
184
  {
 
211
  "download(folder = folder, file = 'Terrestrial_Biodiversity_Summary_-_ACE_[ds2739].geojson',\n",
212
  " file_name = f\"{name}.geojson\")\n",
213
  "\n",
214
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.geojson\")\n",
215
+ "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.geojson\")\n",
216
  "\n",
217
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n",
218
+ "gdf = gpd.read_parquet(f\"{name}.parquet\")\n"
219
  ]
220
  },
221
  {
 
244
  " 'County', 'Shape__Area', 'Shape__Length', 'geometry']\n",
245
  " cols.append(col) #select only the cols we want + the new col. \n",
246
  " rank_df = gdf[gdf[col]==5][cols]# filter ranks = 5\n",
247
+ " cols = process_vector(s3, folder = 'ACE_biodiversity/'+name, file = name+'.parquet',gdf = rank_df)\n",
248
+ " convert_pmtiles(con, s3, folder ='ACE_biodiversity/'+name, file = name+'.parquet')\n"
249
  ]
250
  },
251
  {
 
283
  " percentile = 0.95\n",
284
  " threshold = gdf[col].quantile(percentile)\n",
285
  " ace = gdf[gdf[col]>=threshold][cols]\n",
286
+ " cols = process_vector(s3, folder = 'ACE_biodiversity/'+name, file = name+'.parquet',gdf = ace)\n",
287
+ " convert_pmtiles(con, s3, folder ='ACE_biodiversity/'+name, file = name+'.parquet')\n",
288
  "\n",
289
  "\n",
290
  "# calculate 80% percentile, filter to those >= threshold. \n",
291
  "# subset to calculate acres within each network, % of feature conserved and % of network "
292
  ]
293
  },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": null,
297
+ "id": "50f9c3bc-8e7e-4bb9-b1c9-9718cf8454a8",
298
+ "metadata": {},
299
+ "outputs": [],
300
+ "source": [
301
+ "con = ibis.duckdb.connect(extensions = [\"spatial\", \"h3\"])\n",
302
+ "set_secrets(con)\n",
303
+ "\n",
304
+ "folder = 'Climate_risks/Historical_fire_perimeters'\n",
305
+ "name = 'calfire_2023'\n",
306
+ "\n",
307
+ "url = f\"s3://public-ca30x30/CBN-data/{folder}/hex/{name}.parquet\"\n",
308
+ "\n",
309
+ "con.read_parquet(url).head(10).execute()\n"
310
+ ]
311
+ },
312
  {
313
  "cell_type": "markdown",
314
  "id": "6991222f-7d24-4f10-9ee0-db20513405d6",
 
339
  "folder = 'Biodiversity_unique/Plant_richness'\n",
340
  "name = 'species_D'\n",
341
  "\n",
342
+ "# download(s3, folder = folder, file = f\"{name}.tif\")\n",
343
+ "cols = filter_raster(s3, folder = folder, file = f\"{name}.tif\", percentile = 80)\n",
344
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols)"
345
  ]
346
  },
347
  {
 
366
  "folder = 'Biodiversity_unique/Rarityweighted_endemic_plant_richness'\n",
367
  "name = 'endemicspecies_E'\n",
368
  "\n",
369
+ "download(s3, folder = folder, file = f\"{name}.tif\")\n",
370
+ "cols = filter_raster(s3, folder = folder, file = f\"{name}.tif\", percentile = 80)\n",
371
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols)"
372
  ]
373
  },
374
  {
 
410
  "folder = 'Connectivity_resilience/Resilient_connected_network_allcategories'\n",
411
  "name = 'rcn_wIntactBioCat_caOnly_2020-10-27'\n",
412
  "\n",
413
+ "cols = process_raster(s3, folder = folder, file = f\"{name}.tif\")\n",
414
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols)"
415
  ]
416
  },
417
  {
 
486
  "outputs": [],
487
  "source": [
488
  "%%time \n",
489
+ "con = ibis.duckdb.connect('wetlands',extensions = [\"spatial\", \"h3\"])\n",
490
+ "set_secrets(con)\n",
491
  "\n",
492
  "folder = 'Freshwater_resources/Wetlands'\n",
493
  "name = 'CA_wetlands'\n",
494
  "\n",
495
  "# only pick a subset \n",
496
+ "unzip(s3, folder = folder, file = 'CA_geodatabase_wetlands.zip')\n",
497
  "gdf = gpd.read_file('CA_geodatabase_wetlands.gdb')\n",
498
  "wetlands = ['Freshwater Emergent Wetland', 'Freshwater Forested/Shrub Wetland', 'Estuarine and Marine Wetland']\n",
499
  "gdf = gdf[gdf['WETLAND_TYPE'].isin(wetlands)]\n",
500
  "\n",
501
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
502
+ "convert_pmtiles(con, s3, folder =folder, file = f\"{name}.parquet\")\n",
503
+ "geom_to_h3(con, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
 
504
  ]
505
  },
506
  {
 
599
  "outputs": [],
600
  "source": [
601
  "%%time \n",
602
+ "con = ibis.duckdb.connect('farm',extensions = [\"spatial\", \"h3\"])\n",
603
+ "set_secrets(con)\n",
604
  "\n",
605
  "folder = 'NBS_agriculture/Farmland'\n",
606
+ "unzip(s3, folder = folder, file = 'Important_Farmland_2018.zip')\n",
607
  "\n",
608
  "folder = 'NBS_agriculture/Farmland_all'\n",
609
  "name = 'Important_Farmland_2018'\n",
610
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.gdb\",crs = \"epsg:4326\")\n",
611
+ "convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n",
612
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n",
613
  "\n",
614
  "# only pick a subset \n",
615
  "folder = 'NBS_agriculture/Farmland_all/Farmland'\n",
616
  "name = 'Farmland_2018'\n",
617
+ "gdf = gpd.read_file('Important_Farmland_2018.gdb')\n",
618
+ "farmland_type = ['P','S','L','U'] # prime, statewide importance, local importance, unique\n",
619
+ "gdf_farmland = gdf[gdf['polygon_ty'].isin(farmland_type)]\n",
620
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf_farmland)\n",
621
+ "convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n",
 
 
622
  "\n",
623
  "# grazing lands \n",
624
  "folder = 'NBS_agriculture/Farmland_all/Lands_suitable_grazing'\n",
625
  "name = 'Grazing_land_2018'\n",
626
+ "gdf_grazing = gdf[gdf['polygon_ty'] == 'G']\n",
627
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf_grazing)\n",
628
+ "convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n"
 
629
  ]
630
  },
631
  {
 
662
  "Only YEAR >= 2014. "
663
  ]
664
  },
 
 
 
 
 
 
 
 
665
  {
666
  "cell_type": "code",
667
  "execution_count": null,
 
673
  "folder = 'Climate_risks/Historical_fire_perimeters'\n",
674
  "name = 'calfire_2023'\n",
675
  "\n",
676
+ "unzip(s3, folder = folder, file = 'fire23-1gdb.zip')\n",
677
  "gdf = gpd.read_file('fire23_1.gdb')\n",
678
+ "# gdf = gdf[~gdf['YEAR_'].isna()]\n",
679
+ "# gdf['YEAR_'] = gdf['YEAR_'].astype('int64')\n",
680
  "# gdf = gdf[gdf['YEAR_']>=2014]\n",
681
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
682
+ "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
683
  "\n",
684
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)"
685
  ]
686
  },
687
  {
 
730
  "Do separately for both climate models - CNRM and MIROC.\n",
731
  "'''\n",
732
  "\n",
733
+ "unzip(s3, folder = 'Climate_risks/Mid-century_habitat_climate_exposure', file = 'Midcentury_habitat_climate_exposure.zip')\n",
734
  "\n",
735
  "# still need to do "
736
  ]
 
762
  "folder = 'Progress_data_new_protection/Newly_counted_lands'\n",
763
  "name = 'newly_counted_lands_2024'\n",
764
  "\n",
765
+ "unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
766
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.shp\")\n",
767
+ "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
768
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
769
  ]
770
  },
771
  {
 
787
  "folder = 'Progress_data_new_protection/DAC'\n",
788
  "name = 'DAC_2022'\n",
789
  "\n",
790
+ "unzip(s3, folder = folder, file = 'sb535dacgdbf2022gdb.zip')\n",
791
+ "cols = process_vector(s3, folder = folder, file = 'SB535DACgdb_F_2022.gdb', file_name = f\"{name}.parquet\")\n",
792
+ "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
793
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
794
  ]
795
  },
796
  {
 
814
  "\n",
815
  "folder = 'Progress_data_new_protection/Priority_populations'\n",
816
  "name = 'CalEnviroScreen4'\n",
817
+ "unzip(s3, folder = folder, file = 'Priority Populations 4.0 Geodatabase.zip')\n",
818
  "\n",
819
  "gdf = (con.read_geo('Priority Populations 4.0 Combined Layer.gdb')\n",
820
  " .mutate(id=ibis.row_number().over()) #making a unique id \n",
821
  " ).execute().set_crs('EPSG:3857')\n",
822
  "\n",
823
+ "cols = process_vector(folder = folder, file = 'Priority Populations 4.0 Combined Layer.gdb',\n",
824
  " file_name = f\"{name}.parquet\", gdf = gdf)\n",
825
  "\n",
826
+ "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
827
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
 
 
 
 
828
  ]
829
  },
830
  {
 
845
  "folder = 'Progress_data_new_protection/Low_income_communities'\n",
846
  "name = 'low_income_CalEnviroScreen4'\n",
847
  "\n",
848
+ "unzip(s3, folder = folder, file = 'Priority Populations 4.0 Geodatabase.zip')\n",
849
  "\n",
850
  "gdf = gpd.read_file('Priority Populations 4.0 Combined Layer.gdb')\n",
851
  "gdf = gdf[gdf['Designatio'] =='Low-income community']\n",
852
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
853
+ "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")"
854
  ]
855
  },
856
  {
 
871
  "folder = 'Progress_data_new_protection/Land_Status_Zone_Ecoregion_Counties'\n",
872
  "name = 'all_regions_reGAP_county_eco'\n",
873
  "\n",
874
+ "unzip(s3, folder = folder, file = 'Land_Status_Zone_Ecoregion_Counties.shp.zip')\n",
875
+ "cols = process_vector(s3, folder = folder, file = 'Land_Status_Zone_Ecoregion_Counties.shp',\n",
876
  " file_name = f\"{name}.parquet\")\n",
877
+ "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")"
878
  ]
879
  },
880
  {
 
885
  "# CA Nature data"
886
  ]
887
  },
 
 
 
 
 
 
 
 
 
888
  {
889
  "cell_type": "code",
890
  "execution_count": null,
 
902
  "# download(folder = folder, file = f\"{name}.parquet\")\n",
903
  "\n",
904
  "# gdf = gpd.read_parquet(f\"{name}.parquet\")\n",
905
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\")\n",
906
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
 
 
 
907
  ]
908
  },
909
  {
910
+ "cell_type": "markdown",
911
+ "id": "af486c71-3b84-4685-9794-fbacbf5f81c7",
 
912
  "metadata": {},
 
913
  "source": [
914
+ "# CPAD"
 
 
 
915
  ]
916
  },
917
  {
918
  "cell_type": "code",
919
  "execution_count": null,
920
+ "id": "cf6c896f-65f3-403a-abd9-f7dec2f4f112",
921
  "metadata": {},
922
  "outputs": [],
923
  "source": [
924
+ "con = ibis.duckdb.connect('cpad',extensions = [\"spatial\", \"h3\"])\n",
925
+ "set_secrets(con)\n",
926
+ "\n",
927
+ "folder = 'cpad'\n",
928
+ "name = 'cced_2024b_release'\n",
929
+ "\n",
930
+ "# unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
931
+ "# cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:3310\")\n",
932
+ "# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
933
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:4326\")\n",
934
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= cols)\n",
935
+ "\n",
936
+ "name = 'cpad_2024b_release'\n",
937
+ "# unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
938
+ "# cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:3310\")\n",
939
+ "# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
940
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:4326\")\n",
941
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= cols)"
942
  ]
 
 
 
 
 
 
 
 
943
  }
944
  ],
945
  "metadata": {
preprocess/h3_utils.py CHANGED
@@ -1,5 +1,16 @@
 
1
 
2
- def geom_to_h3(con, name, cols, zoom):
 
 
 
 
 
 
 
 
 
 
3
  """
4
  Computes hexes
5
  """
@@ -38,38 +49,36 @@ def check_size(con, name, zoom, sample_size=100):
38
 
39
  return est_total_h3, max_len
40
 
 
 
41
 
42
- def write_large_geoms(con, s3, bucket, path, name, zoom="8", geom_len_threshold=10_000):
 
 
43
  """
44
  Individually processing large geoms (different from processing "chunks")
45
  """
46
  offset = 0
47
  i = 0
48
- limit=3000
49
  while True:
50
- large_key = f"{path}/hex/{name}_large_{i:03d}.parquet"
51
- print(f"🟠 Checking large geometry batch {i} → {large_key}")
52
 
53
- # check if file already exists in minio
54
- try:
55
- s3.stat_object(bucket, large_key)
56
- print(f"⏩ Skipping existing large batch: {large_key}")
57
- offset += limit
58
  i += 1
59
  continue
60
- except S3Error as err:
61
- if err.code != "NoSuchKey":
62
- raise
63
 
64
- print(f"📝 Writing large geometry batch {i} → {large_key}")
65
  q = con.sql(f'''
66
  SELECT *, UNNEST(h{zoom}) AS h{zoom}
67
  FROM t2
68
  WHERE len(h{zoom}) > {geom_len_threshold}
69
- LIMIT {limit} OFFSET {offset}
70
  ''')
71
 
72
- q.to_parquet(f"s3://{bucket}/{large_key}")
73
 
74
  if q.count().execute() == 0:
75
  break
@@ -79,7 +88,6 @@ def write_large_geoms(con, s3, bucket, path, name, zoom="8", geom_len_threshold=
79
 
80
  return i
81
 
82
-
83
  def join_large_geoms(con, s3, bucket, path, name):
84
  """
85
  If we had to process large geoms individually, join those datasets after conversion.
@@ -87,14 +95,10 @@ def join_large_geoms(con, s3, bucket, path, name):
87
  # check if any large files exist before trying to join
88
  test_key = f"{path}/hex/{name}_large_000.parquet"
89
 
90
- try:
91
- s3.stat_object(bucket, test_key)
92
- except S3Error as err:
93
- if err.code == "NoSuchKey":
94
- print("✅ No large geometry chunks to join.")
95
- return
96
- else:
97
- raise
98
  # join if it exists
99
  con.raw_sql(f'''
100
  COPY (
@@ -103,27 +107,24 @@ def join_large_geoms(con, s3, bucket, path, name):
103
  TO 's3://{bucket}/{path}/hex/{name}_large.parquet'
104
  (FORMAT PARQUET)
105
  ''')
 
106
 
107
-
108
- def chunk_data(con, s3, bucket, path, name, zoom="8", limit=100_000, geom_len_threshold=10_000):
109
  """
110
  Processing large files in chunks.
111
  """
112
  offset = 0
113
  i = 0
114
-
115
  while True:
116
  chunk_path = f"{path}/hex/{name}_chunk{i:03d}.parquet"
117
 
118
- try:
119
- s3.stat_object(bucket, chunk_path)
120
  print(f"⏩ Skipping existing chunk: {chunk_path}")
121
  offset += limit
122
  i += 1
123
  continue
124
- except S3Error as err:
125
- if err.code != "NoSuchKey":
126
- raise
127
 
128
  print(f"📝 Writing chunk {i} → {chunk_path}")
129
  q = con.sql(f'''
@@ -139,13 +140,13 @@ def chunk_data(con, s3, bucket, path, name, zoom="8", limit=100_000, geom_len_th
139
  i += 1
140
 
141
  # process large geometries using same threshold and limit
142
- write_large_geoms(con, s3, bucket, path, name, zoom, geom_len_threshold=geom_len_threshold)
143
  join_large_geoms(con, s3, bucket, path, name)
144
  return i
145
 
146
 
147
 
148
- def join_chunked(bucket, path, name):
149
  """
150
  If we had to chunk the data, join those datasets after conversion.
151
  """
@@ -158,7 +159,8 @@ def join_chunked(bucket, path, name):
158
  ''')
159
 
160
  # def convert_h3(con, folder, file, cols, zoom="8", limit=100_000, geom_len_threshold=10_000):
161
- def convert_h3(con, s3, folder, file, cols, zoom="8", limit=100_000, geom_len_threshold=5_000):
 
162
  """
163
  Driver function to convert geometries to h3
164
  """
@@ -175,14 +177,14 @@ def convert_h3(con, s3, folder, file, cols, zoom="8", limit=100_000, geom_len_th
175
  est_total, max_per_geom = check_size(con, name, zoom)
176
  # if est_total > 500_000 or max_per_geom > geom_len_threshold:
177
 
178
- if est_total > 1_000_000 or max_per_geom > geom_len_threshold:
179
  print("Chunking due to estimated size")
180
- geom_to_h3(con, name, cols, zoom)
181
- chunk_data(con, s3, bucket, path, name, zoom, limit, geom_len_threshold)
182
  join_chunked(con, bucket, path, name)
183
  else:
184
  print("Writing single output")
185
- geom_to_h3(con, name, cols, zoom)
186
  con.sql(f'''
187
  SELECT *, UNNEST(h{zoom}) AS h{zoom}
188
  FROM t2
 
1
+ from utils import *
2
 
3
+ # === CONFIG ===
4
+ default_zoom = "8"
5
+ default_limit = 10_000
6
+ default_geom_len_thresh = 5_000 # H3 cells per geometry
7
+ chunk_limit = default_limit
8
+ large_geom_thresh = default_geom_len_thresh
9
+ est_total_h3_thresh = 150_000
10
+ large_geom_batch_limit = 100
11
+
12
+
13
+ def compute_h3(con, name, cols, zoom):
14
  """
15
  Computes hexes
16
  """
 
49
 
50
  return est_total_h3, max_len
51
 
52
+ # def chunk_large_geom(con, s3, bucket, path, name, zoom=default_zoom, geom_len_threshold=large_geom_thresh):
53
+ # def chunk_large_geom(con, s3, bucket, path, name, zoom="8", geom_len_threshold=10_000):
54
 
55
+ def chunk_large_geom(con, s3, bucket, path, name, zoom=default_zoom,
56
+ geom_len_threshold=large_geom_thresh,
57
+ batch_limit=large_geom_batch_limit):
58
  """
59
  Individually processing large geoms (different from processing "chunks")
60
  """
61
  offset = 0
62
  i = 0
 
63
  while True:
64
+ relative_key = f"{path}/hex/{name}_large_{i:03d}.parquet"
65
+ print(f"🟠 Checking large geometry batch {i} → {relative_key}")
66
 
67
+ if exists_on_s3(s3, folder="", file=relative_key): # we pass relative_key as `file`
68
+ print(f"⏩ Skipping existing large batch: {relative_key}")
69
+ offset += batch_limit
 
 
70
  i += 1
71
  continue
 
 
 
72
 
73
+ print(f"📝 Writing large geometry batch {i} → {relative_key}")
74
  q = con.sql(f'''
75
  SELECT *, UNNEST(h{zoom}) AS h{zoom}
76
  FROM t2
77
  WHERE len(h{zoom}) > {geom_len_threshold}
78
+ LIMIT {batch_limit} OFFSET {offset}
79
  ''')
80
 
81
+ q.to_parquet(f"s3://{bucket}/{relative_key}")
82
 
83
  if q.count().execute() == 0:
84
  break
 
88
 
89
  return i
90
 
 
91
  def join_large_geoms(con, s3, bucket, path, name):
92
  """
93
  If we had to process large geoms individually, join those datasets after conversion.
 
95
  # check if any large files exist before trying to join
96
  test_key = f"{path}/hex/{name}_large_000.parquet"
97
 
98
+ if not exists_on_s3(s3, folder="", file=test_key):
99
+ print("✅ No large geometry chunks to join.")
100
+ return
101
+
 
 
 
 
102
  # join if it exists
103
  con.raw_sql(f'''
104
  COPY (
 
107
  TO 's3://{bucket}/{path}/hex/{name}_large.parquet'
108
  (FORMAT PARQUET)
109
  ''')
110
+
111
 
112
+ # def chunk_geom(con, s3, bucket, path, name, zoom="8", limit=50_000, geom_len_threshold=10_000):
113
+ def chunk_geom(con, s3, bucket, path, name, zoom=default_zoom, limit=chunk_limit, geom_len_threshold=large_geom_thresh):
114
  """
115
  Processing large files in chunks.
116
  """
117
  offset = 0
118
  i = 0
119
+
120
  while True:
121
  chunk_path = f"{path}/hex/{name}_chunk{i:03d}.parquet"
122
 
123
+ if exists_on_s3(s3, folder="", file=chunk_path): # relative path passed as file
 
124
  print(f"⏩ Skipping existing chunk: {chunk_path}")
125
  offset += limit
126
  i += 1
127
  continue
 
 
 
128
 
129
  print(f"📝 Writing chunk {i} → {chunk_path}")
130
  q = con.sql(f'''
 
140
  i += 1
141
 
142
  # process large geometries using same threshold and limit
143
+ chunk_large_geom(con, s3, bucket, path, name, zoom, geom_len_threshold=geom_len_threshold)
144
  join_large_geoms(con, s3, bucket, path, name)
145
  return i
146
 
147
 
148
 
149
+ def join_chunked(con, bucket, path, name):
150
  """
151
  If we had to chunk the data, join those datasets after conversion.
152
  """
 
159
  ''')
160
 
161
  # def convert_h3(con, folder, file, cols, zoom="8", limit=100_000, geom_len_threshold=10_000):
162
+ # def convert_h3(con, s3, folder, file, cols, zoom="8", limit=50_000, geom_len_threshold=5_000):
163
+ def convert_h3(con, s3, folder, file, cols, zoom=default_zoom, limit=chunk_limit, geom_len_threshold=large_geom_thresh):
164
  """
165
  Driver function to convert geometries to h3
166
  """
 
177
  est_total, max_per_geom = check_size(con, name, zoom)
178
  # if est_total > 500_000 or max_per_geom > geom_len_threshold:
179
 
180
+ if est_total > est_total_h3_thresh or max_per_geom > geom_len_threshold:
181
  print("Chunking due to estimated size")
182
+ compute_h3(con, name, cols, zoom)
183
+ chunk_geom(con, s3, bucket, path, name, zoom, limit, geom_len_threshold)
184
  join_chunked(con, bucket, path, name)
185
  else:
186
  print("Writing single output")
187
+ compute_h3(con, name, cols, zoom)
188
  con.sql(f'''
189
  SELECT *, UNNEST(h{zoom}) AS h{zoom}
190
  FROM t2
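Putting the pieces together, the revised convert_h3 driver chooses between a single output file and chunked output based on the sampled size estimate. A condensed sketch of that control flow (not the full implementation; info comes from utils.py, check_size/compute_h3/chunk_geom/join_chunked are the functions in this file, the table-registration step is omitted, and the single-file output key is an assumption):

import os

def convert_h3_flow(con, s3, folder, file, cols, zoom=default_zoom,
                    limit=chunk_limit, geom_len_threshold=large_geom_thresh):
    bucket, path = info(folder, file)                        # maps folder/file to (bucket, object path)
    name, _ = os.path.splitext(os.path.basename(file))       # assumption: table name derived from the file name
    est_total, max_per_geom = check_size(con, name, zoom)    # sampled estimate: total cells, max cells per geometry

    if est_total > est_total_h3_thresh or max_per_geom > geom_len_threshold:
        compute_h3(con, name, cols, zoom)                    # builds the t2 relation of per-geometry cell arrays
        chunk_geom(con, s3, bucket, path, name, zoom, limit, geom_len_threshold)   # row chunks + oversize geoms
        join_chunked(con, bucket, path, name)                # concatenate the chunked parquet parts
    else:
        compute_h3(con, name, cols, zoom)
        con.sql(f"SELECT *, UNNEST(h{zoom}) AS h{zoom} FROM t2") \
           .to_parquet(f"s3://{bucket}/{path}/hex/{name}.parquet")   # output key is an assumption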
preprocess/utils.py CHANGED
@@ -1,4 +1,5 @@
1
  from minio.error import S3Error
 
2
 
3
  import zipfile
4
  import os
@@ -40,7 +41,7 @@ def upload(s3, folder, file):
40
  s3.fput_object(bucket, path ,file)
41
  return
42
 
43
- def unzip(folder, file):
44
  """
45
  Unzipping zip files
46
  """
@@ -49,8 +50,8 @@ def unzip(folder, file):
49
  zip_ref.extractall()
50
  return
51
 
52
- # def process_vector(folder, file, file_name = None, gdf = None, crs="EPSG:3310"):
53
- def process_vector(folder, file, file_name = None, gdf = None, crs="EPSG:4326"):
54
  """
55
  Driver function to process vectors
56
  """
@@ -67,20 +68,56 @@ def process_vector(folder, file, file_name = None, gdf = None, crs="EPSG:4326"):
67
  parquet_file = f"{name}{'.parquet'}"
68
  gdf.to_parquet(parquet_file)
69
  upload(s3, folder, parquet_file)
70
- return
71
-
72
 
73
- # def upload_parquet(folder, file, gdf):
74
- # """
75
- # Uploading parquets
76
- # """
77
- # name, ext = os.path.splitext(file)
78
- # parquet_file = f"{name}{'.parquet'}"
79
- # gdf.to_parquet(parquet_file)
80
- # upload(folder, parquet_file)
81
- # return
82
 
 
 
 
 
 
 
 
 
 
 
 
84
  def reproject_raster(input_file, crs="EPSG:3310"):
85
  """
86
  Reproject rasters
@@ -147,7 +184,7 @@ def make_vector(input_file, crs="EPSG:4326"):
147
 
148
  gdf.to_parquet(output_file)
149
  print(gdf)
150
- return output_file
151
 
152
  def filter_raster(s3, folder, file, percentile):
153
  """
@@ -168,31 +205,33 @@ def filter_raster(s3, folder, file, percentile):
168
  profile.update(dtype=rasterio.float64)
169
  with rasterio.open(new_file, "w", **profile) as dst:
170
  dst.write(filtered, 1)
171
- process_raster(s3, folder, file)
172
- return
173
 
174
- def process_raster(s3, folder, file, file_name = None):
175
- """
176
- Driver function to process rasters
177
- """
178
- if file_name:
179
- file = file_name
180
- output_file = reproject_raster(file)
181
- upload(s3, folder, output_file)
182
- output_cog_file = make_cog(output_file)
183
- upload(s3, folder, output_cog_file)
184
- output_vector = make_vector(output_file)
185
- upload(s3, folder, output_vector)
186
- return
187
-
188
- def convert_pmtiles(folder, file):
189
  """
190
  Convert to PMTiles with tippecanoe
191
  """
192
  name, ext = os.path.splitext(file)
193
  if ext != '.geojson':
194
- con.read_parquet(file).execute().set_crs('epsg:3310').to_crs('epsg:4326').to_file(name+'.geojson')
 
195
  to_pmtiles(name+'.geojson', name+'.pmtiles', options = ['--extend-zooms-if-still-dropping'])
196
  upload(s3, folder, name+'.pmtiles')
197
  return
198
 
 
 
 
 
 
 
 
1
  from minio.error import S3Error
2
+ from cng.utils import *
3
 
4
  import zipfile
5
  import os
 
41
  s3.fput_object(bucket, path ,file)
42
  return
43
 
44
+ def unzip(s3, folder, file):
45
  """
46
  Unzipping zip files
47
  """
 
50
  zip_ref.extractall()
51
  return
52
 
53
+ # def process_vector(s3, folder, file, file_name = None, gdf = None, crs="EPSG:3310"):
54
+ def process_vector(s3, folder, file, file_name = None, gdf = None, crs="EPSG:4326"):
55
  """
56
  Driver function to process vectors
57
  """
 
68
  parquet_file = f"{name}{'.parquet'}"
69
  gdf.to_parquet(parquet_file)
70
  upload(s3, folder, parquet_file)
 
 
71
 
72
+ return gdf.drop('geom',axis = 1).columns.to_list()
 
 
 
 
 
 
 
 
73
 
74
+ def process_raster(s3, folder, file, file_name = None):
75
+ """
76
+ Driver function to process rasters
77
+ """
78
+ if file_name:
79
+ file = file_name
80
+ # output_file = reproject_raster(file)
81
+ # upload(s3, folder, output_file)
82
+ # output_cog_file = make_cog(output_file)
83
+ # upload(s3, folder, output_cog_file)
84
+ # output_vector, cols = make_vector(output_file)
85
+ # upload(s3, folder, output_vector)
86
 
87
+ name, ext = os.path.splitext(file)
88
+ output_file = f"{name}_processed{ext}"
89
+
90
+ output_cog_file = f"{name}_processed_COG{ext}"
91
+
92
+ output_vector_file = f"{name}_processed.parquet"
93
+ print(output_file)
94
+ print(output_cog_file)
95
+ print(output_vector_file)
96
+ # Reproject raster
97
+ if not exists_on_s3(s3, folder, output_file):
98
+ output_file = reproject_raster(file)
99
+ upload(s3, folder, output_file)
100
+ else:
101
+ print(f"{output_file} already exists on S3, skipping reprojection/upload.")
102
+
103
+ # Make COG
104
+ if not exists_on_s3(s3, folder, output_cog_file):
105
+ output_cog_file = make_cog(output_file)
106
+ upload(s3, folder, output_cog_file)
107
+ else:
108
+ print(f"{output_cog_file} already exists on S3, skipping COG conversion/upload.")
109
+
110
+ # Vectorize raster
111
+ if not exists_on_s3(s3, folder, output_vector_file):
112
+ output_vector_file, cols = make_vector(output_file)
113
+ upload(s3, folder, output_vector_file)
114
+ else:
115
+ print(f"{output_vector_file} already exists on S3, skipping vectorization/upload.")
116
+ # We still need column names
117
+ gdf = gpd.read_parquet(output_vector_file)
118
+ cols = gdf.drop('geom', axis=1).columns.to_list()
119
+ return cols
120
+
121
  def reproject_raster(input_file, crs="EPSG:3310"):
122
  """
123
  Reproject rasters
 
184
 
185
  gdf.to_parquet(output_file)
186
  print(gdf)
187
+ return output_file, gdf.drop('geom',axis = 1).columns.to_list()
188
 
189
  def filter_raster(s3, folder, file, percentile):
190
  """
 
205
  profile.update(dtype=rasterio.float64)
206
  with rasterio.open(new_file, "w", **profile) as dst:
207
  dst.write(filtered, 1)
208
+ cols = process_raster(s3, folder, file)
209
+ return cols
210
 
211
+
212
+ def convert_pmtiles(con, s3, folder, file):
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  """
214
  Convert to PMTiles with tippecanoe
215
  """
216
  name, ext = os.path.splitext(file)
217
  if ext != '.geojson':
218
+ (con.read_parquet(file).execute().set_crs('epsg:3310')
219
+ .to_crs('epsg:4326').to_file(name+'.geojson'))
220
  to_pmtiles(name+'.geojson', name+'.pmtiles', options = ['--extend-zooms-if-still-dropping'])
221
  upload(s3, folder, name+'.pmtiles')
222
  return
223
 
224
+ def exists_on_s3(s3, folder, file):
225
+ """
226
+ Check if a file exists on S3
227
+ """
228
+ bucket, path = info(folder, file)
229
+ print(bucket)
230
+ print(path)
231
+
232
+ try:
233
+ s3.stat_object(bucket, path)
234
+ return True
235
+ except S3Error:
236
+ return False
237
+
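The new exists_on_s3 helper is what makes reruns cheap: each expensive step in process_raster is skipped when its output object is already in the bucket. An illustrative usage sketch (the folder and file names are made up; make_cog and upload are the helpers defined above):

folder = "Habitat"                                        # hypothetical example inputs
output_cog_file = "fveg22_1_processed_COG.tif"

if not exists_on_s3(s3, folder, output_cog_file):         # stat_object succeeds only if the object exists
    output_cog_file = make_cog("fveg22_1_processed.tif")  # build the Cloud-Optimized GeoTIFF locally
    upload(s3, folder, output_cog_file)                   # push it to the bucket
else:
    print(f"{output_cog_file} already exists on S3, skipping COG conversion/upload.")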