cassiebuhler committed
Commit 5a3c665 · 1 Parent(s): 6ab4321

fixed! now using a different method for h3.

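The "different method" in the commit message is the rework of h3_utils.py below: instead of polyfilling every geometry again at each resolution (and chunking when the estimate got large), convert_h3() now looks for an existing hex/zoomN/ output and refines it with h3_cell_to_children(). A minimal sketch of that idea — my illustration, not code from this commit — using only the DuckDB h3 functions the diff itself relies on:

import duckdb

con = duckdb.connect()
con.execute("INSTALL h3 FROM community;")
con.execute("LOAD h3;")

# Polyfill a small square once at zoom 8, then derive zoom 9 by expanding each cell
# into its children -- no second pass over the geometry is needed.
con.sql("""
    WITH coarse AS (
        SELECT UNNEST(h3_polygon_wkt_to_cells_string(
            'POLYGON ((-122.5 37.7, -122.5 37.8, -122.4 37.8, -122.4 37.7, -122.5 37.7))',
            8)) AS h8
    )
    SELECT h8, UNNEST(h3_cell_to_children(h8, 9)) AS h9
    FROM coarse
""").show()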
preprocess/CBN-data.ipynb CHANGED
@@ -48,11 +48,11 @@
  "folder = 'Counties'\n",
  "name = 'CA_counties'\n",
  "\n",
- "unzip(s3, folder = folder, file = '30x30_Counties.zip')\n",
+ "# unzip(s3, folder = folder, file = '30x30_Counties.zip')\n",
  "cols = process_vector(s3, folder = folder, file = f\"{name}.shp\")\n",
- "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
+ "# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
  "\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)"
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)"
  ]
  },
  {
@@ -76,11 +76,10 @@
  "\n",
  "folder = 'Climate_zones'\n",
  "name = 'climate_zones_10'\n",
- "download(s3, folder = folder, file = 'clusters_10.tif')\n",
+ "# download(s3, folder = folder, file = 'clusters_10.tif')\n",
  "cols = process_raster(s3, folder = folder, file = 'clusters_10.tif', file_name = f\"{name}.tif\")\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols)\n",
- "\n",
- "\n"
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols,\n",
+ " zoom = 8)\n"
  ]
  },
  {
@@ -105,10 +104,10 @@
  "folder = 'Ecoregion'\n",
  "name = 'ACE_ecoregions'\n",
  "\n",
- "unzip(s3, folder = folder, file = '30x30_Ecoregions.zip')\n",
+ "# unzip(s3, folder = folder, file = '30x30_Ecoregions.zip')\n",
  "cols = process_vector(s3, folder = folder, file = f\"{name}.shp\")\n",
  "\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)"
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)"
  ]
  },
  {
@@ -127,24 +126,11 @@
  "#### 13 class major habitat types **"
  ]
  },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "df40e121-e2d4-4962-9c30-ed7e931bb705",
- "metadata": {},
- "outputs": [],
- "source": [
- "# download(folder = 'Habitat', file = 'CWHR13_2022.tif')\n",
- "# cols = process_raster(s3, folder = 'Habitat', file = 'CWHR13_2022.tif')"
- ]
- },
  {
  "cell_type": "code",
  "execution_count": null,
  "id": "de501ac3-f6fe-44f5-86c1-afba763147ae",
- "metadata": {
- "scrolled": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "%%time\n",
@@ -153,7 +139,6 @@
  "\n",
  "folder = 'Habitat'\n",
  "name = 'fveg22_1'\n",
- "\n",
  "# unzip(s3, folder = folder, file = 'fveg221gdb.zip')\n",
  "\n",
  "# command = [\n",
@@ -167,24 +152,8 @@
  "cols = process_raster(s3, folder = folder, file = f\"{name}.tif\")\n",
  "# upload(folder = folder, file = f'{name}_processed.tif.aux.xml')\n",
  "\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ac178c43-f6a5-4286-a348-48bfcb1e9397",
- "metadata": {},
- "outputs": [],
- "source": [
- "# url = f\"s3://public-ca30x30/{folder}/{name}.parquet\"\n",
- "\n",
- "folder = 'Habitat'\n",
- "name = 'fveg22_1'\n",
- "url = f\"s3://public-ca30x30/CBN-data/{folder}/hex/{name}.parquet\"\n",
- "\n",
- "con.read_parquet(url).head(5).execute()\n",
- "\n"
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols,\n",
+ " zoom = 8)"
  ]
  },
  {
@@ -223,7 +192,7 @@
  "cols = process_vector(s3, folder = folder, file = f\"{name}.geojson\")\n",
  "# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.geojson\")\n",
  "\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n",
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)\n",
  "# gdf = gpd.read_parquet(f\"{name}.parquet\")\n"
  ]
  },
@@ -332,7 +301,7 @@
  "\n",
  "# download(s3, folder = folder, file = f\"{name}.tif\")\n",
  "cols = filter_raster(s3, folder = folder, file = f\"{name}.tif\", percentile = 80)\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols)"
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols, zoom = 8)"
  ]
  },
  {
@@ -359,7 +328,7 @@
  "\n",
  "download(s3, folder = folder, file = f\"{name}.tif\")\n",
  "cols = filter_raster(s3, folder = folder, file = f\"{name}.tif\", percentile = 80)\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols)"
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols, zoom = 8)"
  ]
  },
  {
@@ -402,7 +371,8 @@
  "name = 'rcn_wIntactBioCat_caOnly_2020-10-27'\n",
  "\n",
  "cols = process_raster(s3, folder = folder, file = f\"{name}.tif\")\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols)"
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols = cols, \n",
+ " zoom = 8)"
  ]
  },
  {
@@ -491,7 +461,7 @@
  "\n",
  "cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
  "convert_pmtiles(con, s3, folder =folder, file = f\"{name}.parquet\")\n",
- "geom_to_h3(con, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
+ "geom_to_h3(con, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)\n"
  ]
  },
  {
@@ -594,29 +564,29 @@
  "set_secrets(con)\n",
  "\n",
  "folder = 'NBS_agriculture/Farmland'\n",
- "unzip(s3, folder = folder, file = 'Important_Farmland_2018.zip')\n",
+ "# unzip(s3, folder = folder, file = 'Important_Farmland_2018.zip')\n",
  "\n",
  "folder = 'NBS_agriculture/Farmland_all'\n",
  "name = 'Important_Farmland_2018'\n",
  "cols = process_vector(s3, folder = folder, file = f\"{name}.gdb\",crs = \"epsg:4326\")\n",
- "convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n",
+ "# convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n",
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)\n",
  "\n",
  "# only pick a subset \n",
  "folder = 'NBS_agriculture/Farmland_all/Farmland'\n",
  "name = 'Farmland_2018'\n",
- "gdf = gpd.read_file('Important_Farmland_2018.gdb')\n",
- "farmland_type = ['P','S','L','U'] # prime, statewide importance, local importance, unique\n",
- "gdf_farmland = gdf[gdf['polygon_ty'].isin(farmland_type)]\n",
- "cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf_farmland)\n",
- "convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n",
+ "# gdf = gpd.read_file('Important_Farmland_2018.gdb')\n",
+ "# farmland_type = ['P','S','L','U'] # prime, statewide importance, local importance, unique\n",
+ "# gdf_farmland = gdf[gdf['polygon_ty'].isin(farmland_type)]\n",
+ "# cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf_farmland)\n",
+ "# convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n",
  "\n",
  "# grazing lands \n",
  "folder = 'NBS_agriculture/Farmland_all/Lands_suitable_grazing'\n",
  "name = 'Grazing_land_2018'\n",
- "gdf_grazing = gdf[gdf['polygon_ty'] == 'G']\n",
- "cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf_grazing)\n",
- "convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n"
+ "# gdf_grazing = gdf[gdf['polygon_ty'] == 'G']\n",
+ "# cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf_grazing)\n",
+ "# convert_pmtiles(con, s3, folder = folder, file =f\"{name}.parquet\")\n"
  ]
  },
  {
@@ -673,9 +643,9 @@
  "# gdf['YEAR_'] = gdf['YEAR_'].astype('int64')\n",
  "# gdf = gdf[gdf['YEAR_']>=2014]\n",
  "cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
- "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
+ "# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
  "\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)"
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)"
  ]
  },
  {
@@ -759,10 +729,10 @@
  "folder = 'Progress_data_new_protection/Newly_counted_lands'\n",
  "name = 'newly_counted_lands_2024'\n",
  "\n",
- "unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
- "cols = process_vector(s3, folder = folder, file = f\"{name}.shp\")\n",
- "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
+ "# unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.shp\",crs = \"epsg:4326\")\n",
+ "# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)\n"
  ]
  },
  {
@@ -790,7 +760,7 @@
  "unzip(s3, folder = folder, file = 'sb535dacgdbf2022gdb.zip')\n",
  "cols = process_vector(s3, folder = folder, file = 'SB535DACgdb_F_2022.gdb', file_name = f\"{name}.parquet\")\n",
  "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
+ "# convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)\n"
  ]
  },
  {
@@ -814,17 +784,17 @@
  "\n",
  "folder = 'Progress_data_new_protection/Priority_populations'\n",
  "name = 'CalEnviroScreen4'\n",
- "unzip(s3, folder = folder, file = 'Priority Populations 4.0 Geodatabase.zip')\n",
+ "# unzip(s3, folder = folder, file = 'Priority Populations 4.0 Geodatabase.zip')\n",
  "\n",
  "gdf = (con.read_geo('Priority Populations 4.0 Combined Layer.gdb')\n",
  " .mutate(id=ibis.row_number().over()) #making a unique id \n",
  " ).execute().set_crs('EPSG:3857')\n",
  "\n",
- "cols = process_vector(folder = folder, file = 'Priority Populations 4.0 Combined Layer.gdb',\n",
+ "cols = process_vector(s3, folder = folder, file = 'Priority Populations 4.0 Combined Layer.gdb',\n",
  " file_name = f\"{name}.parquet\", gdf = gdf)\n",
  "\n",
- "convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
+ "# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, zoom = 8)\n"
  ]
  },
  {
@@ -903,13 +873,14 @@
  "set_secrets(con)\n",
  "\n",
  "# file = 'ca-30x30-base.parquet'\n",
- "folder = \"Preprocessing\"\n",
+ "folder = \"CA_Nature/2024/Preprocessing\"\n",
  "name = 'ca-30x30-base'\n",
- "# download(folder = folder, file = f\"{name}.parquet\")\n",
+ "download(s3, folder = folder, file = f\"{name}.parquet\")\n",
  "\n",
  "# gdf = gpd.read_parquet(f\"{name}.parquet\")\n",
- "cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\")\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols)\n"
+ "cols = process_vector(s3, folder = folder, file = f\"{name}.parquet\", crs=\"EPSG:4326\")\n",
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols = cols, \n",
+ " zoom = 8)\n"
  ]
  },
  {
@@ -930,21 +901,21 @@
  "con = ibis.duckdb.connect('cpad',extensions = [\"spatial\", \"h3\"])\n",
  "set_secrets(con)\n",
  "\n",
- "folder = 'cpad'\n",
+ "folder = 'CPAD'\n",
  "name = 'cced_2024b_release'\n",
  "\n",
  "# unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
  "# cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:3310\")\n",
  "# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
  "cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:4326\")\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= cols)\n",
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= cols, zoom = 8)\n",
  "\n",
  "name = 'cpad_2024b_release'\n",
  "# unzip(s3, folder = folder, file = f\"{name}.shp.zip\")\n",
  "# cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:3310\")\n",
  "# convert_pmtiles(con, s3, folder = folder, file = f\"{name}.parquet\")\n",
  "cols = process_vector(s3, folder = folder, file = f\"{name}.shp\", crs=\"EPSG:4326\")\n",
- "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= cols)"
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= cols, zoom = 8)"
  ]
  }
 ],
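With these changes, each layer's hexes land under hex/zoom8/ next to its source parquet (the inspection cell removed above did the same kind of check against the old CBN-data/ prefix). A quick spot-check of one output, sketched under the assumption that the MinIO credentials are configured the same way the notebook does it:

import ibis
from utils import *  # set_secrets() and friends, as imported throughout this repo

con = ibis.duckdb.connect(extensions=["spatial", "h3"])
set_secrets(con)  # S3/MinIO credentials, same helper the notebook cells call

# Illustrative path: 'Counties'/'CA_counties' from the first cell, zoom 8 from convert_h3(..., zoom = 8).
url = "s3://public-ca30x30/CBN/Counties/hex/zoom8/CA_counties.parquet"
con.read_parquet(url).head(5).execute()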
preprocess/h3_utils.py CHANGED
@@ -1,14 +1,63 @@
  from utils import *
+ import re
 
- default_zoom = "8"
- max_h3_n = 1_000_000 # if est total H3 cells > this -> process in chunks
- chunk_n = 10_000 # chunk size (# of geoms)
- big_n = 10_000 # if geoms > big_n -> they are "big" and processed individually
- batch_n = 5_000 # big geoms processed in batches of this size
+ def convert_h3(con, s3, folder, file, cols, zoom):
+     """
+     Driver function to convert geometries to h3.
+     If no zoom levels exist -> compute from geometry at target zoom.
+     If lower zoom exists -> compute children from max available until target zoom.
+     """
+     cols = ", ".join(cols) if isinstance(cols, list) else cols
+     bucket, path = info(folder, file)
+     path, file = os.path.split(path)
+     name, ext = os.path.splitext(file)
+     name = name.replace('-', '')
+     print(f"Processing: {name}")
 
- def compute_h3(con, name, cols, zoom):
+     hex_paths = s3.list_objects(bucket, prefix=f"{path}/hex/", recursive=True)
+     zooms = []
+     # check what zooms exist
+     for obj in hex_paths:
+         match = re.search(r"/zoom(\d{1,2})/", obj.object_name)
+         if match:
+             zooms.append(int(match.group(1)))
+
+     if not zooms: # if no h3 files exist
+         print(f'No h3 files exists, computing {zoom} from geometry.')
+         con.read_parquet(f"s3://{bucket}/{path}/{file}", table_name=name)
+         h3_from_geom(con, name, cols, zoom)
+         con.sql(f'''
+             SELECT {cols}, UNNEST(h{zoom}) AS h{zoom}
+             FROM t2
+         ''').to_parquet(f"s3://{bucket}/{path}/hex/zoom{zoom}/{name}.parquet")
+
+     else:
+         current_zoom = max(zooms)
+
+         if zoom in zooms:
+             print(f'Zoom {zoom} already exists!')
+             return
+
+         elif current_zoom < zoom: #compute child of most refined zoom level
+             print(f'Reading zoom {current_zoom}')
+             con.read_parquet(
+                 f"s3://{bucket}/{path}/hex/zoom{current_zoom}/{name}.parquet",
+                 table_name=f"h3_h{current_zoom}"
+             )
+             print(f'Computing {zoom} from {current_zoom}')
+
+             for z in range(current_zoom + 1, zoom + 1):
+                 print(f'Current zoom {z}')
+                 h3_from_parent(con, z)
+                 con.sql(f'''
+                     SELECT *, UNNEST(h3_cell_to_children(h{z-1}, {z})) AS h{z}
+                     FROM h3_h{z-1}
+                 ''').to_parquet(f"s3://{bucket}/{path}/hex/zoom{z}/{name}.parquet")
+
+
+ def h3_from_geom(con, name, cols, zoom):
      """
-     Computes hexes
+     Computes hexes directly from geometry.
      """
      con.raw_sql(f'''
      CREATE OR REPLACE TEMP TABLE t2 AS
@@ -17,161 +66,14 @@ def compute_h3(con, name, cols, zoom):
          FROM {name}
      )
      SELECT {cols},
-     h3_polygon_wkt_to_cells_string(UNNEST(geom).geom, {zoom}) AS h{zoom}
+     h3_polygon_wkt_to_cells_string(ST_Force2D(UNNEST(geom).geom), {zoom}) AS h{zoom}
      FROM t1
      ''')
-
- def check_size(con, name, zoom, sample_size):
-     """
-     Estimating size of geoms to decide if we need to process in chunks
-     """
-     query = f"""
-     SELECT
-         avg(len(h3_polygon_wkt_to_cells_string(ST_AsText(geom), {zoom}))::DOUBLE) AS avg_h3_len,
-         max(len(h3_polygon_wkt_to_cells_string(ST_AsText(geom), {zoom}))) AS max_h3_len,
-         count(*) AS total_rows
-     FROM {name}
-     USING SAMPLE {sample_size}
-     """
-     stats = con.sql(query).execute()
-     avg_len = stats.iloc[0]['avg_h3_len']
-     max_len = stats.iloc[0]['max_h3_len']
-     total_rows = con.table(name).count().execute()
-
-     est_total_h3 = avg_len * total_rows
-
-     print(f"Estimated total H3 cells: {est_total_h3:,.0f}")
-     print(f"Max H3 cells in one geometry: {max_len:,}")
-
-     return est_total_h3, max_len
-
- def chunk_large_geom(con, s3, bucket, path, name, zoom, big_n, batch_limit):
-     """
-     Individually processing large geoms (different from processing "chunks")
-     """
-     offset = 0
-     i = 0
-     while True:
-         relative_key = f"{path}/hex/zoom{zoom}/{name}_large_{i:03d}.parquet"
-         print(f"🟠 Checking large geometry batch {i} → {relative_key}")
-
-         if exists_on_s3(s3, folder="", file=relative_key): # we pass relative_key as `file`
-             print(f"⏩ Skipping existing large batch: {relative_key}")
-             offset += batch_limit
-             i += 1
-             continue
 
-         print(f"📝 Writing large geometry batch {i} → {relative_key}")
-         q = con.sql(f'''
-             SELECT *, UNNEST(h{zoom}) AS h{zoom}
-             FROM t2
-             WHERE len(h{zoom}) > {big_n}
-             LIMIT {batch_limit} OFFSET {offset}
-         ''')
-
-         q.to_parquet(f"s3://{bucket}/{relative_key}")
-
-         if q.count().execute() == 0:
-             break
-
-         offset += batch_limit
-         i += 1
-
-     return i
-
- def join_large_geoms(con, s3, bucket, path, name, zoom):
-     """
-     If we had to process large geoms individually, join those datasets after conversion.
-     """
-     # check if any large files exist before trying to join
-     test_key = f"{path}/hex/zoom{zoom}/{name}_large_000.parquet"
 
-     if not exists_on_s3(s3, folder="", file=test_key):
-         print("✅ No large geometry chunks to join.")
-         return
-
-     # join if it exists
+ def h3_from_parent(con, zoom):
      con.raw_sql(f'''
-     COPY (
-         SELECT * FROM read_parquet('s3://{bucket}/{path}/hex/zoom{zoom}/{name}_large_*.parquet')
-     )
-     TO 's3://{bucket}/{path}/hex/zoom{zoom}/{name}_large.parquet'
-     (FORMAT PARQUET)
+     CREATE OR REPLACE TEMP TABLE h3_h{zoom} AS
+     SELECT *, UNNEST(h3_cell_to_children(h{zoom-1}, {zoom})) AS h{zoom}
+     FROM h3_h{zoom-1}
      ''')
-
-
- def chunk_geom(con, s3, bucket, path, name, zoom, limit, batch_limit, big_n):
-     """
-     Processing files in chunks.
-     """
-     offset = 0
-     i = 0
-
-     while True:
-         chunk_path = f"{path}/hex/zoom{zoom}/{name}_chunk{i:03d}.parquet"
-
-         if exists_on_s3(s3, folder="", file=chunk_path): # relative path passed as file
-             print(f"⏩ Skipping existing chunk: {chunk_path}")
-             offset += limit
-             i += 1
-             continue
-
-         print(f"📝 Writing chunk {i} → {chunk_path}")
-         q = con.sql(f'''
-             SELECT *, UNNEST(h{zoom}) AS h{zoom}
-             FROM t2
-             WHERE len(h{zoom}) <= {big_n}
-             LIMIT {limit} OFFSET {offset}
-         ''')
-         q.to_parquet(f"s3://{bucket}/{chunk_path}")
-         if q.count().execute() == 0:
-             break
-         offset += limit
-         i += 1
-
-     # process large geometries using same threshold and limit
-     chunk_large_geom(con, s3, bucket, path, name, zoom, big_n, batch_limit)
-     join_large_geoms(con, s3, bucket, path, name, zoom)
-     return i
-
-
-
- def join_chunked(con, bucket, path, name, zoom):
-     """
-     If we had to chunk the data, join those datasets after conversion.
-     """
-     con.raw_sql(f'''
-     COPY (
-         SELECT * FROM read_parquet('s3://{bucket}/{path}/hex/zoom{zoom}/{name}_chunk*.parquet')
-     )
-     TO 's3://{bucket}/{path}/hex/zoom{zoom}/{name}.parquet'
-     (FORMAT PARQUET)
-     ''')
-
- def convert_h3(con, s3, folder, file, cols, zoom=default_zoom, limit=chunk_n, batch_limit = batch_n, big_n=big_n, max_h3_n = max_h3_n):
-     """
-     Driver function to convert geometries to h3
-     """
-     cols = ", ".join(cols) if isinstance(cols, list) else cols
-     bucket, path = info(folder, file)
-     path, file = os.path.split(path)
-     name, ext = os.path.splitext(file)
-     name = name.replace('-', '')
-
-     print(f"Processing: {name}")
-     con.read_parquet(f"s3://{bucket}/{path}/{file}", table_name=name)
-
-     # Decide to chunk or not
-     est_total, max_per_geom = check_size(con, name, zoom, sample_size=100)
-     if est_total > max_h3_n or max_per_geom > big_n:
-         print("Chunking due to estimated size")
-         compute_h3(con, name, cols, zoom)
-         chunk_geom(con, s3, bucket, path, name, zoom, limit, batch_limit, big_n)
-         join_chunked(con, bucket, path, name, zoom)
-     else:
-         print("Writing single output")
-         compute_h3(con, name, cols, zoom)
-         con.sql(f'''
-             SELECT *, UNNEST(h{zoom}) AS h{zoom}
-             FROM t2
-         ''').to_parquet(f"s3://{bucket}/{path}/hex/zoom{zoom}/{name}.parquet")
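A hedged usage sketch of the reworked driver, mirroring the notebook cells above; `set_secrets` comes from the project's utils, the MinIO client is built explicitly here only for illustration, and the column list would normally be whatever process_vector() returned:

import ibis
from minio import Minio
from utils import *            # set_secrets, info, ... as used by h3_utils
from h3_utils import convert_h3

con = ibis.duckdb.connect(extensions=["spatial", "h3"])
set_secrets(con)
s3 = Minio("minio.carlboettiger.info")   # the notebooks obtain this client via their own helpers

folder, name = 'Ecoregion', 'ACE_ecoregions'   # illustrative layer from CBN-data.ipynb
cols = ['Ecoregion_Name']                      # hypothetical column list; normally returned by process_vector()

# First run: no hex/zoom*/ outputs exist yet, so cells are polyfilled from geometry at zoom 8.
convert_h3(con, s3, folder=folder, file=f"{name}.parquet", cols=cols, zoom=8)

# Later run at a finer zoom: the zoom-8 parquet is read back and expanded with
# h3_cell_to_children(), writing one parquet per intermediate zoom level.
convert_h3(con, s3, folder=folder, file=f"{name}.parquet", cols=cols, zoom=10)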
preprocess/preprocess.ipynb CHANGED
@@ -10,7 +10,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
  "id": "f7e6298c-d886-432a-a1b7-c3fee914c24f",
  "metadata": {
  "editable": true,
@@ -48,12 +48,12 @@
  },
  {
  "cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
  "id": "63dd33b8-6d3c-4852-9899-6ed5775d19c0",
  "metadata": {},
  "outputs": [],
  "source": [
- "def get_url(folder, file, base_folder = 'CBN-data'):\n",
+ "def get_url(folder, file, base_folder = 'CBN'):\n",
  " minio = 'https://minio.carlboettiger.info/'\n",
  " bucket = 'public-ca30x30'\n",
  " if base_folder is None:\n",
@@ -80,7 +80,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
  "id": "13214bbe-3a74-4247-981f-5a6eb6c486f5",
  "metadata": {},
  "outputs": [],
@@ -90,7 +90,7 @@
  "# ca_raw_parquet = 'ca_areas.parquet'\n",
  "\n",
  "# Boundary of CA, used to computed 'non-conserved' areas\n",
- "ca_boundary_parquet = get_url('Preprocessing','ca_boundary.parquet',base_folder = None)\n",
+ "ca_boundary_parquet = get_url('CA_Nature/2024/Preprocessing','ca_boundary.parquet',base_folder = None)\n",
  "\n",
  "# newly protected areas \n",
  "newly_protected = get_url('Progress_data_new_protection/Newly_counted_lands','newly_counted_lands_2024.parquet')\n",
@@ -167,43 +167,10 @@
  },
  {
  "cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
  "id": "0f9666d1-7c2b-45af-9399-e4189bba34f5",
  "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "52ef18913f17417299860d91e36e9dbd",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 4min 28s, sys: 6.1 s, total: 4min 34s\n",
- "Wall time: 2min 18s\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "<minio.helpers.ObjectWriteResult at 0x7ff0943c7710>"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
  "source": [
  "%%time \n",
  "# match CA Nature schema \n",
@@ -241,7 +208,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
  "id": "a3d4f189-1563-4868-9f1f-64d67569df27",
  "metadata": {},
  "outputs": [],
@@ -298,7 +265,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
  "id": "a59c976b-3c36-40f9-a15b-cefcd155c647",
  "metadata": {},
  "outputs": [],
@@ -344,58 +311,10 @@
  },
  {
  "cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
  "id": "4d6177e2-8ece-4eb9-acc2-5fb5c5beb8bb",
  "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "09f24f1359a84ae2a4b69360cc8e852b",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "c10ce980d24e45b6bad9b8a70c176f2c",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/opt/conda/lib/python3.12/site-packages/ibis/common/deferred.py:408: FutureWarning: `Value.case` is deprecated as of v10.0.0; use value.cases() or ibis.cases()\n",
- " return func(*args, **kwargs)\n"
- ]
- },
- {
- "ename": "NameError",
- "evalue": "name 'non_conserved' is not defined",
- "output_type": "error",
- "traceback": [
- "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
- "\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
- "\u001b[36mFile \u001b[39m\u001b[32m<timed exec>:50\u001b[39m\n",
- "\u001b[31mNameError\u001b[39m: name 'non_conserved' is not defined"
- ]
- }
- ],
+ "outputs": [],
  "source": [
  "%%time \n",
  "counties = con.read_parquet('../CA_counties.parquet')\n",
@@ -454,7 +373,7 @@
  "gdf = all_data.execute()\n",
  "\n",
  "gdf.set_crs(\"epsg:3310\").to_parquet(ca_base_parquet)\n",
- "s3.fput_object(\"public-ca30x30\", 'Preprocessing/'+ca_base_parquet, ca_base_parquet) "
+ "s3.fput_object(\"public-ca30x30\", 'CA_Nature/2024/Preprocessing/'+ca_base_parquet, ca_base_parquet) "
  ]
  },
  {
@@ -485,7 +404,7 @@
  "\n",
  "def get_habitat_type(fieldname):\n",
  " aux_xml_path = 'fveg22_1_processed.tif.aux.xml'\n",
- " s3.fget_object('public-ca30x30','CBN-data/Habitat/'+aux_xml_path, aux_xml_path)\n",
+ " s3.fget_object('public-ca30x30','CBN/Habitat/'+aux_xml_path, aux_xml_path)\n",
  " tree = ET.parse(aux_xml_path)\n",
  " root = tree.find(\".//GDALRasterAttributeTable\")\n",
  " field_names = [f.find(\"Name\").text for f in root.findall(\"FieldDefn\")]\n",
preprocess/utils.py CHANGED
@@ -15,7 +15,7 @@ from shapely.geometry import shape
  import numpy as np
 
 
- def info(folder, file, bucket = "public-ca30x30", base_folder = 'CBN-data/'):
+ def info(folder, file, bucket = "public-ca30x30", base_folder = 'CBN/'):
      """
      Extract minio path to upload/download data
      """
@@ -77,22 +77,10 @@ def process_raster(s3, folder, file, file_name = None):
      """
      if file_name:
          file = file_name
-     # output_file = reproject_raster(file)
-     # upload(s3, folder, output_file)
-     # output_cog_file = make_cog(output_file)
-     # upload(s3, folder, output_cog_file)
-     # output_vector, cols = make_vector(output_file)
-     # upload(s3, folder, output_vector)
-
      name, ext = os.path.splitext(file)
      output_file = f"{name}_processed{ext}"
-
      output_cog_file = f"{name}_processed_COG{ext}"
-
      output_vector_file = f"{name}_processed.parquet"
-     print(output_file)
-     print(output_cog_file)
-     print(output_vector_file)
      # Reproject raster
      if not exists_on_s3(s3, folder, output_file):
          output_file = reproject_raster(file)
@@ -183,7 +171,6 @@ def make_vector(input_file, crs="EPSG:4326"):
      gdf.to_crs(crs, inplace=True)
 
      gdf.to_parquet(output_file)
-     print(gdf)
      return output_file, gdf.drop('geom',axis = 1).columns.to_list()
 
  def filter_raster(s3, folder, file, percentile):
@@ -226,9 +213,6 @@ def exists_on_s3(s3, folder, file):
      Check if a file exists on S3
      """
      bucket, path = info(folder, file)
-     print(bucket)
-     print(path)
-
      try:
          s3.stat_object(bucket, path)
          return True
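The only behavioral change here is the default base folder, 'CBN-data/' → 'CBN/'; the rest removes debug prints. A hedged illustration of what that means for callers — the exact string joining inside info() is not shown in this hunk, so the expected prefix is an assumption based on the path changes in preprocess.ipynb above:

from utils import info

bucket, path = info('Habitat', 'fveg22_1_processed.tif.aux.xml')
print(bucket)  # "public-ca30x30"
print(path)    # expected to start with 'CBN/Habitat/' now, rather than 'CBN-data/Habitat/'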