cassiebuhler committed
Commit db54b60 · 1 Parent(s): d67e420

wip; h3!!!

preprocess/CBN-data.ipynb CHANGED
@@ -15,9 +15,15 @@
15
  "metadata": {},
16
  "outputs": [],
17
  "source": [
18
- "from cng.utils import set_secrets, s3_client, to_pmtiles\n",
19
  "s3 = s3_client()\n",
20
  "\n",
21
  "import zipfile\n",
22
  "import os\n",
23
  "import subprocess\n",
@@ -26,140 +32,15 @@
26
  "import geopandas as gpd\n",
27
  "import ibis\n",
28
  "from ibis import _\n",
29
- "con = ibis.duckdb.connect(extensions=[\"spatial\"])\n",
 
30
  "\n",
31
  "import rasterio\n",
32
  "import numpy as np"
33
  ]
34
  },
35
- {
36
- "cell_type": "markdown",
37
- "id": "a6e4eac1-9bde-4d83-a386-60a055dfe402",
38
- "metadata": {},
39
- "source": [
40
- "#### Helper functions"
41
- ]
42
- },
43
- {
44
- "cell_type": "code",
45
- "execution_count": null,
46
- "id": "0fc80b2e-375a-4295-95cf-5bfc313b78fd",
47
- "metadata": {},
48
- "outputs": [],
49
- "source": [
50
- "def info(folder, file, bucket = \"public-ca30x30\", base_folder = 'CBN-data/'):\n",
51
- " path = os.path.join(base_folder, folder, file)\n",
52
- " return bucket, path \n",
53
- " \n",
54
- "def download(folder, file, file_name = None):\n",
55
- " if not file_name: \n",
56
- " file_name = file\n",
57
- " bucket, path = info(folder, file)\n",
58
- " s3.fget_object(bucket, path ,file_name) \n",
59
- " return\n",
60
- "\n",
61
- "def upload(folder, file):\n",
62
- " bucket, path = info(folder, file)\n",
63
- " s3.fput_object(bucket, path ,file) \n",
64
- " return\n",
65
- "\n",
66
- "def unzip(folder, file):\n",
67
- " download(folder, file)\n",
68
- " with zipfile.ZipFile(file, 'r') as zip_ref:\n",
69
- " zip_ref.extractall()\n",
70
- " return \n",
71
- "\n",
72
- "def upload_parquet(folder, file, gdf):\n",
73
- " name, ext = os.path.splitext(file)\n",
74
- " parquet_file = f\"{name}{'.parquet'}\"\n",
75
- " gdf.to_parquet(parquet_file)\n",
76
- " upload(folder, parquet_file)\n",
77
- " return \n",
78
- "\n",
79
- "def process_vector(folder, file, file_name = None, gdf = None, crs=\"EPSG:3310\"):\n",
80
- " if gdf is None:\n",
81
- " gdf = gpd.read_file(file)\n",
82
- " gdf = gdf.to_crs(crs)\n",
83
- " gdf = gdf.rename_geometry('geom')\n",
84
- " if file_name:\n",
85
- " file = file_name\n",
86
- " upload_parquet(folder, file, gdf)\n",
87
- " return \n",
88
- "\n",
89
- "def reproject_raster(input_file, crs=\"EPSG:3310\"):\n",
90
- " suffix = '_processed'\n",
91
- " name, ext = os.path.splitext(input_file)\n",
92
- " output_file = f\"{name}{suffix}{ext}\"\n",
93
- " command = [\n",
94
- " \"gdalwarp\",\n",
95
- " \"-t_srs\", crs,\n",
96
- " input_file,\n",
97
- " output_file \n",
98
- " ]\n",
99
- " try:\n",
100
- " subprocess.run(command, check=True)\n",
101
- " print(f\"Reprojection successful!\")\n",
102
- " except subprocess.CalledProcessError as e:\n",
103
- " print(f\"Error occurred during reprojection: {e}\")\n",
104
- " return output_file \n",
105
- "\n",
106
- "def make_cog(input_file, crs=\"EPSG:4326\"):\n",
107
- " suffix = '_COG'\n",
108
- " name, ext = os.path.splitext(input_file)\n",
109
- " output_file = f\"{name}{suffix}{ext}\"\n",
110
- " command = [\n",
111
- " \"gdalwarp\",\n",
112
- " \"-t_srs\", crs,\n",
113
- " \"-of\", \"COG\",\n",
114
- " input_file,\n",
115
- " output_file \n",
116
- " ]\n",
117
- " try:\n",
118
- " subprocess.run(command, check=True)\n",
119
- " print(f\"Successful!\")\n",
120
- " except subprocess.CalledProcessError as e:\n",
121
- " print(f\"Error occurred during processing: {e}\")\n",
122
- " return output_file \n",
123
- "\n",
124
- "def process_raster(folder, file, file_name = None):\n",
125
- " if file_name:\n",
126
- " file = file_name\n",
127
- " output_file = reproject_raster(file)\n",
128
- " upload(folder, output_file)\n",
129
- " output_cog_file = make_cog(output_file)\n",
130
- " upload(folder, output_cog_file)\n",
131
- " return\n",
132
- "\n",
133
- "def filter_raster(folder, file, percentile):\n",
134
- " with rasterio.open(file) as src:\n",
135
- " data = src.read(1) # Read the first band\n",
136
- " profile = src.profile\n",
137
- " \n",
138
- " # mask no data values\n",
139
- " masked_data = np.ma.masked_equal(data, src.nodata)\n",
140
- "\n",
141
- " # compute percentile/threshold \n",
142
- " p = np.percentile(masked_data.compressed(),percentile)\n",
143
- " filtered = np.where(data >= p, data, src.nodata)\n",
144
- " \n",
145
- " name, ext = os.path.splitext(file)\n",
146
- " new_file = f\"{name}{'_'}{percentile}{'percentile'}{ext}\"\n",
147
- "\n",
148
- " profile.update(dtype=rasterio.float64)\n",
149
- " with rasterio.open(new_file, \"w\", **profile) as dst:\n",
150
- " dst.write(filtered, 1)\n",
151
- " \n",
152
- " process_raster(folder, file)\n",
153
- " return\n",
154
- "\n",
155
- "def convert_pmtiles(folder, parquet_file):\n",
156
- " name, ext = os.path.splitext(parquet_file)\n",
157
- " con.read_parquet(parquet_file).execute().set_crs('epsg:3310').to_crs('epsg:4326').to_file(name+'.geojson')\n",
158
- " to_pmtiles(name+'.geojson', name+'.pmtiles', options = ['--extend-zooms-if-still-dropping'])\n",
159
- " upload(folder, name+'.pmtiles')\n",
160
- " return"
161
- ]
162
- },
163
  {
164
  "cell_type": "markdown",
165
  "id": "2219c881-1017-4def-9d0c-c83b5541b5d2",
@@ -175,8 +56,18 @@
175
  "metadata": {},
176
  "outputs": [],
177
  "source": [
178
- "unzip(folder = 'Counties', file = '30x30_Counties.zip')\n",
179
- "process_vector(folder = 'Counties', file = 'CA_counties.shp')"
180
  ]
181
  },
182
  {
@@ -194,8 +85,15 @@
194
  "metadata": {},
195
  "outputs": [],
196
  "source": [
197
- "download(folder = 'Climate_zones', file = 'clusters_10.tif')\n",
198
- "process_raster(folder = 'Climate_zones', file = 'clusters_10.tif', file_name = 'climate_zones_10.tif')"
199
  ]
200
  },
201
  {
@@ -213,8 +111,17 @@
213
  "metadata": {},
214
  "outputs": [],
215
  "source": [
216
- "unzip(folder = 'Ecoregion', file = '30x30_Ecoregions.zip')\n",
217
- "process_vector(folder = 'Ecoregion', file = 'ACE_ecoregions.shp')"
218
  ]
219
  },
220
  {
@@ -241,7 +148,7 @@
241
  "outputs": [],
242
  "source": [
243
  "# download(folder = 'Habitat', file = 'CWHR13_2022.tif')\n",
244
- "# process_raster(folder = 'Habitat', file = 'CWHR13_2022.tif')"
245
  ]
246
  },
247
  {
@@ -251,18 +158,27 @@
251
  "metadata": {},
252
  "outputs": [],
253
  "source": [
254
- "unzip(folder = 'Habitat', file = 'fveg221gdb.zip')\n",
255
  "\n",
256
- "command = [\n",
257
- " \"gdalwarp\",\n",
258
- " \"-of\", \"GTiff\",\n",
259
- " 'fveg22_1.gdb',\n",
260
- " 'fveg22_1.tif' \n",
261
- " ]\n",
262
  "\n",
263
- "subprocess.run(command, check=True)\n",
264
- "process_raster(folder = 'Habitat', file = 'fveg22_1.tif')\n",
265
- "upload(folder = 'Habitat', file = 'fveg22_1_processed.tif.aux.xml')\n"
266
  ]
267
  },
268
  {
@@ -288,12 +204,18 @@
288
  "metadata": {},
289
  "outputs": [],
290
  "source": [
291
- "download(folder = 'ACE_biodiversity', file = 'Terrestrial_Biodiversity_Summary_-_ACE_[ds2739].geojson',\n",
292
- " file_name = 'ACE_biodiversity_all_ds2739.geojson')\n",
293
- "gdf = gpd.read_file('ACE_biodiversity_all_ds2739.geojson')\n",
294
  "\n",
295
- "to_pmtiles('ACE_biodiversity_all_ds2739.geojson', 'ACE_biodiversity_ds2739_all.pmtiles', options = ['--extend-zooms-if-still-dropping'])\n",
296
- "upload(folder = 'ACE_biodiversity',file = 'ACE_biodiversity_ds2739_all.pmtiles')"
297
  ]
298
  },
299
  {
@@ -323,7 +245,7 @@
323
  " cols.append(col) #select only the cols we want + the new col. \n",
324
  " rank_df = gdf[gdf[col]==5][cols]# filter ranks = 5\n",
325
  " process_vector(folder = 'ACE_biodiversity/'+name, file = name+'.parquet',gdf = rank_df)\n",
326
- " convert_pmtiles(folder ='ACE_biodiversity/'+name, parquet_file = name+'.parquet')\n"
327
  ]
328
  },
329
  {
@@ -362,7 +284,7 @@
362
  " threshold = gdf[col].quantile(percentile)\n",
363
  " ace = gdf[gdf[col]>=threshold][cols]\n",
364
  " process_vector(folder = 'ACE_biodiversity/'+name, file = name+'.parquet',gdf = ace)\n",
365
- " convert_pmtiles(folder ='ACE_biodiversity/'+name, parquet_file = name+'.parquet')\n",
366
  "\n",
367
  "\n",
368
  "# calculate 80% percentile, filter to those >= threshold. \n",
@@ -392,8 +314,16 @@
392
  "metadata": {},
393
  "outputs": [],
394
  "source": [
395
- "download(folder = 'Biodiversity_unique/Plant_richness', file = 'species_D.tif')\n",
396
- "filter_raster(folder = 'Biodiversity_unique/Plant_richness', file = 'species_D.tif', percentile = 80)\n"
397
  ]
398
  },
399
  {
@@ -411,10 +341,16 @@
411
  "metadata": {},
412
  "outputs": [],
413
  "source": [
414
- "download(folder = 'Biodiversity_unique/Rarityweighted_endemic_plant_richness', file = 'endemicspecies_E.tif')\n",
415
  "\n",
416
- "filter_raster(folder = 'Biodiversity_unique/Rarityweighted_endemic_plant_richness',\n",
417
- " file = 'endemicspecies_E.tif', percentile = 80)"
 
418
  ]
419
  },
420
  {
@@ -449,8 +385,15 @@
449
  "metadata": {},
450
  "outputs": [],
451
  "source": [
452
- "process_raster(folder = 'Connectivity_resilience/Resilient_connected_network_allcategories',\n",
453
- " file = 'rcn_wIntactBioCat_caOnly_2020-10-27.tif')"
454
  ]
455
  },
456
  {
@@ -524,14 +467,21 @@
524
  "metadata": {},
525
  "outputs": [],
526
  "source": [
527
- "unzip(folder = 'Freshwater_resources/Wetlands', file = 'CA_geodatabase_wetlands.zip')\n",
528
  "\n",
529
  "# only pick a subset \n",
 
530
  "gdf = gpd.read_file('CA_geodatabase_wetlands.gdb')\n",
531
  "wetlands = ['Freshwater Emergent Wetland', 'Freshwater Forested/Shrub Wetland', 'Estuarine and Marine Wetland']\n",
532
  "gdf = gdf[gdf['WETLAND_TYPE'].isin(wetlands)]\n",
533
- "process_vector(folder = 'Freshwater_resources/Wetlands', file = 'CA_wetlands.parquet', gdf = gdf)\n",
534
- "convert_pmtiles(folder ='Freshwater_resources/Wetlands', parquet_file ='CA_wetlands.parquet')\n"
535
  ]
536
  },
537
  {
@@ -625,23 +575,39 @@
625
  {
626
  "cell_type": "code",
627
  "execution_count": null,
628
- "id": "7e39ee64-3123-43f0-9552-319010708b34",
629
  "metadata": {},
630
  "outputs": [],
631
  "source": [
632
- "unzip(folder = 'NBS_agriculture/Farmland', file = 'Important_Farmland_2018.zip')\n",
633
  "\n",
634
  "# only pick a subset \n",
635
- "gdf = gpd.read_file('Important_Farmland_2018.gdb')\n",
636
- "farmland_type = ['P','S','L','U'] # prime, statewide importance, local importance, unique\n",
637
- "gdf_farmland = gdf[gdf['polygon_ty'].isin(farmland_type)]\n",
638
- "process_vector(folder = 'NBS_agriculture/Farmland', file = 'Farmland_2018.parquet', gdf = gdf_farmland)\n",
639
- "convert_pmtiles(folder ='NBS_agriculture/Farmland', parquet_file ='Farmland_2018.parquet')\n",
640
  "\n",
641
- "gdf_grazing = gdf[gdf['polygon_ty'] == 'G']\n",
642
- "process_vector(folder = 'NBS_agriculture/Lands_suitable_grazing', \n",
643
- " file = 'Grazing_land_2018.parquet', gdf = gdf_grazing)\n",
644
- "convert_pmtiles(folder ='NBS_agriculture/Lands_suitable_grazing', parquet_file ='Grazing_land_2018.parquet')\n"
645
  ]
646
  },
647
  {
@@ -678,6 +644,14 @@
678
  "Only YEAR >= 2014. "
679
  ]
680
  },
681
  {
682
  "cell_type": "code",
683
  "execution_count": null,
@@ -685,14 +659,19 @@
685
  "metadata": {},
686
  "outputs": [],
687
  "source": [
688
- "unzip(folder = 'Climate_risks/Historical_fire_perimeters', file = 'fire23-1gdb.zip')\n",
689
  "gdf = gpd.read_file('fire23_1.gdb')\n",
690
  "gdf = gdf[~gdf['YEAR_'].isna()]\n",
691
  "gdf['YEAR_'] = gdf['YEAR_'].astype('int64')\n",
692
- "gdf = gdf[gdf['YEAR_']>=2014]\n",
693
  "\n",
694
- "process_vector(folder = 'Climate_risks/Historical_fire_perimeters', file = 'calfire_2023.parquet', gdf = gdf)\n",
695
- "convert_pmtiles(folder ='Climate_risks/Historical_fire_perimeters', parquet_file ='calfire_2023.parquet')\n"
696
  ]
697
  },
698
  {
@@ -769,9 +748,14 @@
769
  "metadata": {},
770
  "outputs": [],
771
  "source": [
772
- "unzip(folder = 'Progress_data_new_protection/Newly_counted_lands', file = 'newly_counted_lands_2024.shp.zip')\n",
773
- "process_vector(folder = 'Progress_data_new_protection/Newly_counted_lands', file = 'newly_counted_lands_2024.shp')\n",
774
- "convert_pmtiles(folder ='Progress_data_new_protection/Newly_counted_lands', parquet_file ='newly_counted_lands_2024.parquet')\n"
775
  ]
776
  },
777
  {
@@ -789,10 +773,14 @@
789
  "metadata": {},
790
  "outputs": [],
791
  "source": [
792
- "unzip(folder = 'Progress_data_new_protection/DAC', file = 'sb535dacgdbf2022gdb.zip')\n",
793
- "process_vector(folder = 'Progress_data_new_protection/DAC', file = 'SB535DACgdb_F_2022.gdb',\n",
794
- " file_name = 'DAC_2022.parquet')\n",
795
- "convert_pmtiles(folder ='Progress_data_new_protection/DAC', parquet_file ='DAC_2022.parquet')\n"
796
  ]
797
  },
798
  {
@@ -809,7 +797,42 @@
809
  "id": "fc236b85-9fc4-4589-8dd9-5efb7f2e9614",
810
  "metadata": {},
811
  "outputs": [],
812
- "source": []
813
  },
814
  {
815
  "cell_type": "markdown",
@@ -826,16 +849,15 @@
826
  "metadata": {},
827
  "outputs": [],
828
  "source": [
829
- "unzip(folder = 'Progress_data_new_protection/Low_income_communities',\n",
830
- " file = 'Priority Populations 4.0 Geodatabase.zip')\n",
831
  "\n",
832
  "gdf = gpd.read_file('Priority Populations 4.0 Combined Layer.gdb')\n",
833
  "gdf = gdf[gdf['Designatio'] =='Low-income community']\n",
834
- "\n",
835
- "\n",
836
- "process_vector(folder = 'Progress_data_new_protection/Low_income_communities', \n",
837
- " file = 'low_income_CalEnviroScreen4.parquet',gdf = gdf)\n",
838
- "convert_pmtiles(folder ='Progress_data_new_protection/Low_income_communities', parquet_file ='low_income_CalEnviroScreen4.parquet')\n"
839
  ]
840
  },
841
  {
@@ -853,12 +875,117 @@
853
  "metadata": {},
854
  "outputs": [],
855
  "source": [
856
- "unzip(folder = 'Progress_data_new_protection/Land_Status_Zone_Ecoregion_Counties',\n",
857
- " file = 'Land_Status_Zone_Ecoregion_Counties.shp.zip')\n",
858
- "process_vector(folder = 'Progress_data_new_protection/Land_Status_Zone_Ecoregion_Counties', file = 'Land_Status_Zone_Ecoregion_Counties.shp',\n",
859
- " file_name = 'all_regions_reGAP_county_eco.parquet')\n",
860
- "convert_pmtiles(folder ='Progress_data_new_protection/Land_Status_Zone_Ecoregion_Counties', parquet_file ='all_regions_reGAP_county_eco.parquet')\n"
861
  ]
862
  }
863
  ],
864
  "metadata": {
@@ -877,7 +1004,7 @@
877
  "name": "python",
878
  "nbconvert_exporter": "python",
879
  "pygments_lexer": "ipython3",
880
- "version": "3.12.8"
881
  }
882
  },
883
  "nbformat": 4,
 
15
  "metadata": {},
16
  "outputs": [],
17
  "source": [
18
+ "from cng.utils import *\n",
19
+ "from utils import *\n",
20
+ "from h3_utils import *\n",
21
  "s3 = s3_client()\n",
22
  "\n",
23
+ "duckdb_install_h3()\n",
24
+ "from minio.error import S3Error\n",
25
+ "\n",
26
+ "\n",
27
  "import zipfile\n",
28
  "import os\n",
29
  "import subprocess\n",
 
32
  "import geopandas as gpd\n",
33
  "import ibis\n",
34
  "from ibis import _\n",
35
+ "# con = ibis.duckdb.connect('temp',extensions = [\"spatial\", \"h3\"])\n",
36
+ "# set_secrets(con)\n",
37
  "\n",
38
  "import rasterio\n",
39
+ "from rasterio.features import shapes\n",
40
+ "from shapely.geometry import shape\n",
41
  "import numpy as np"
42
  ]
43
  },
44
  {
45
  "cell_type": "markdown",
46
  "id": "2219c881-1017-4def-9d0c-c83b5541b5d2",
 
56
  "metadata": {},
57
  "outputs": [],
58
  "source": [
59
+ "%%time \n",
60
+ "con = ibis.duckdb.connect('counties',extensions = [\"spatial\", \"h3\"])\n",
61
+ "set_secrets(con)\n",
62
+ "\n",
63
+ "folder = 'Counties'\n",
64
+ "name = 'CA_counties'\n",
65
+ "\n",
66
+ "# unzip(folder = folder, file = '30x30_Counties.zip')\n",
67
+ "# process_vector(folder = folder, file = f\"{name}.shp\")\n",
68
+ "# convert_pmtiles(folder = folder, file = f\"{name}.parquet\")\n",
69
+ "\n",
70
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= \"COUNTY_NAM\")"
71
  ]
72
  },
73
  {
 
85
  "metadata": {},
86
  "outputs": [],
87
  "source": [
88
+ "%%time \n",
89
+ "con = ibis.duckdb.connect('climate_zones',extensions = [\"spatial\", \"h3\"])\n",
90
+ "set_secrets(con)\n",
91
+ "\n",
92
+ "folder = 'Climate_zones'\n",
93
+ "name = 'climate_zones_10'\n",
94
+ "# download(folder = folder, file = 'clusters_10.tif')\n",
95
+ "# process_raster(s3, folder = folder, file = 'clusters_10.tif', file_name = f\"{name}.tif\")\n",
96
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols= \"id\")"
97
  ]
98
  },
99
  {
 
111
  "metadata": {},
112
  "outputs": [],
113
  "source": [
114
+ "%%time \n",
115
+ "con = ibis.duckdb.connect('ecoregion',extensions = [\"spatial\", \"h3\"])\n",
116
+ "set_secrets(con)\n",
117
+ "\n",
118
+ "folder = 'Ecoregion'\n",
119
+ "name = 'ACE_ecoregions'\n",
120
+ "\n",
121
+ "unzip(folder = folder, file = '30x30_Ecoregions.zip')\n",
122
+ "process_vector(folder = folder, file = f\"{name}.shp\")\n",
123
+ "\n",
124
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= 'CA_Ecoregi')"
125
  ]
126
  },
127
  {
 
148
  "outputs": [],
149
  "source": [
150
  "# download(folder = 'Habitat', file = 'CWHR13_2022.tif')\n",
151
+ "# process_raster(s3, folder = 'Habitat', file = 'CWHR13_2022.tif')"
152
  ]
153
  },
154
  {
 
158
  "metadata": {},
159
  "outputs": [],
160
  "source": [
161
+ "%%time\n",
162
+ "con = ibis.duckdb.connect('habitat',extensions = [\"spatial\", \"h3\"])\n",
163
+ "set_secrets(con)\n",
164
+ "\n",
165
+ "folder = 'Habitat'\n",
166
+ "name = 'fveg22_1'\n",
167
  "\n",
168
+ "# unzip(folder = folder, file = 'fveg221gdb.zip')\n",
169
  "\n",
170
+ "# command = [\n",
171
+ "# \"gdalwarp\",\n",
172
+ "# \"-of\", \"GTiff\",\n",
173
+ "# 'fveg22_1.gdb',\n",
174
+ "# 'fveg22_1.tif' \n",
175
+ "# ]\n",
176
+ "\n",
177
+ "# subprocess.run(command, check=True)\n",
178
+ "process_raster(s3, folder = folder, file = f\"{name}.tif\")\n",
179
+ "upload(folder = folder, file = f'{name}_processed.tif.aux.xml')\n",
180
+ "\n",
181
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols= \"id\")"
182
  ]
183
  },
184
  {
 
204
  "metadata": {},
205
  "outputs": [],
206
  "source": [
207
+ "%%time \n",
208
+ "folder = 'ACE_biodiversity'\n",
209
+ "name = 'ACE_terrestrial_biodiversity_summary_ds2739'\n",
210
+ "\n",
211
+ "download(folder = folder, file = 'Terrestrial_Biodiversity_Summary_-_ACE_[ds2739].geojson',\n",
212
+ " file_name = f\"{name}.geojson\")\n",
213
+ "\n",
214
+ "process_vector(folder = folder, file = f\"{name}.geojson\")\n",
215
+ "# convert_pmtiles(folder = folder, file = f\"{name}.geojson\")\n",
216
  "\n",
217
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= \"OBJECTID\")\n",
218
+ "# gdf = gpd.read_parquet(f\"{name}.parquet\")\n"
219
  ]
220
  },
221
  {
 
245
  " cols.append(col) #select only the cols we want + the new col. \n",
246
  " rank_df = gdf[gdf[col]==5][cols]# filter ranks = 5\n",
247
  " process_vector(folder = 'ACE_biodiversity/'+name, file = name+'.parquet',gdf = rank_df)\n",
248
+ " convert_pmtiles(folder ='ACE_biodiversity/'+name, file = name+'.parquet')\n"
249
  ]
250
  },
251
  {
 
284
  " threshold = gdf[col].quantile(percentile)\n",
285
  " ace = gdf[gdf[col]>=threshold][cols]\n",
286
  " process_vector(folder = 'ACE_biodiversity/'+name, file = name+'.parquet',gdf = ace)\n",
287
+ " convert_pmtiles(folder ='ACE_biodiversity/'+name, file = name+'.parquet')\n",
288
  "\n",
289
  "\n",
290
  "# calculate 80% percentile, filter to those >= threshold. \n",
 
314
  "metadata": {},
315
  "outputs": [],
316
  "source": [
317
+ "%%time \n",
318
+ "con = ibis.duckdb.connect('plant',extensions = [\"spatial\", \"h3\"])\n",
319
+ "set_secrets(con)\n",
320
+ "\n",
321
+ "folder = 'Biodiversity_unique/Plant_richness'\n",
322
+ "name = 'species_D'\n",
323
+ "\n",
324
+ "download(folder = folder, file = f\"{name}.tif\")\n",
325
+ "filter_raster(folder = folder, file = f\"{name}.tif\", percentile = 80)\n",
326
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols= \"id\")"
327
  ]
328
  },
329
  {
 
341
  "metadata": {},
342
  "outputs": [],
343
  "source": [
344
+ "%%time \n",
345
+ "con = ibis.duckdb.connect('end_plant',extensions = [\"spatial\", \"h3\"])\n",
346
+ "set_secrets(con)\n",
347
+ "\n",
348
+ "folder = 'Biodiversity_unique/Rarityweighted_endemic_plant_richness'\n",
349
+ "name = 'endemicspecies_E'\n",
350
  "\n",
351
+ "download(folder = folder, file = f\"{name}.tif\")\n",
352
+ "filter_raster(folder = folder, file = f\"{name}.tif\", percentile = 80)\n",
353
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols= \"id\")"
354
  ]
355
  },
356
  {
 
385
  "metadata": {},
386
  "outputs": [],
387
  "source": [
388
+ "%%time \n",
389
+ "con = ibis.duckdb.connect('CRN',extensions = [\"spatial\", \"h3\"])\n",
390
+ "set_secrets(con)\n",
391
+ "\n",
392
+ "folder = 'Connectivity_resilience/Resilient_connected_network_allcategories'\n",
393
+ "name = 'rcn_wIntactBioCat_caOnly_2020-10-27'\n",
394
+ "\n",
395
+ "process_raster(s3, folder = folder, file = f\"{name}.tif\")\n",
396
+ "convert_h3(con, s3, folder = folder, file = f\"{name}_processed.parquet\", cols= \"id\")"
397
  ]
398
  },
399
  {
 
467
  "metadata": {},
468
  "outputs": [],
469
  "source": [
470
+ "%%time \n",
471
+ "\n",
472
+ "folder = 'Freshwater_resources/Wetlands'\n",
473
+ "name = 'CA_wetlands'\n",
474
  "\n",
475
  "# only pick a subset \n",
476
+ "unzip(folder = folder, file = 'CA_geodatabase_wetlands.zip')\n",
477
  "gdf = gpd.read_file('CA_geodatabase_wetlands.gdb')\n",
478
  "wetlands = ['Freshwater Emergent Wetland', 'Freshwater Forested/Shrub Wetland', 'Estuarine and Marine Wetland']\n",
479
  "gdf = gdf[gdf['WETLAND_TYPE'].isin(wetlands)]\n",
480
+ "\n",
481
+ "process_vector(folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
482
+ "# convert_pmtiles(folder =folder, file = f\"{name}.parquet\")\n",
483
+ "geom_to_h3(con, folder = folder, file = f\"{name}.parquet\", cols= ['ATTRIBUTE','WETLAND_TYPE','NWI_ID'])\n",
484
+ "\n"
485
  ]
486
  },
487
  {
 
575
  {
576
  "cell_type": "code",
577
  "execution_count": null,
578
+ "id": "fdf7061f-7598-4303-bb77-38f836feac8a",
579
  "metadata": {},
580
  "outputs": [],
581
  "source": [
582
+ "%%time \n",
583
+ "\n",
584
+ "folder = 'NBS_agriculture/Farmland'\n",
585
+ "unzip(folder = folder, file = 'Important_Farmland_2018.zip')\n",
586
+ "\n",
587
+ "folder = 'NBS_agriculture/Farmland_all'\n",
588
+ "name = 'Important_Farmland_2018'\n",
589
+ "process_vector(folder = folder, file = f\"{name}.gdb\")\n",
590
+ "# convert_pmtiles(folder = folder, file =f\"{name}.parquet\")\n",
591
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= ['county_nam','polygon_ty'])\n",
592
  "\n",
593
  "# only pick a subset \n",
594
+ "folder = 'NBS_agriculture/Farmland_all/Farmland'\n",
595
+ "name = 'Farmland_2018'\n",
596
+ "# gdf = gpd.read_file('Important_Farmland_2018.gdb')\n",
597
+ "# farmland_type = ['P','S','L','U'] # prime, statewide importance, local importance, unique\n",
598
+ "# gdf_farmland = gdf[gdf['polygon_ty'].isin(farmland_type)]\n",
599
+ "# process_vector(folder = folder, file = f\"{name}.parquet\", gdf = gdf_farmland)\n",
600
+ "# convert_pmtiles(folder = folder, file =f\"{name}.parquet\")\n",
601
+ "\n",
602
  "\n",
603
+ "\n",
604
+ "# grazing lands \n",
605
+ "folder = 'NBS_agriculture/Farmland_all/Lands_suitable_grazing'\n",
606
+ "name = 'Grazing_land_2018'\n",
607
+ "\n",
608
+ "# gdf_grazing = gdf[gdf['polygon_ty'] == 'G']\n",
609
+ "# process_vector(folder = folder, file = f\"{name}.parquet\", gdf = gdf_grazing)\n",
610
+ "# convert_pmtiles(folder = folder, file =f\"{name}.parquet\")\n"
611
  ]
612
  },
613
  {
 
644
  "Only YEAR >= 2014. "
645
  ]
646
  },
647
+ {
648
+ "cell_type": "code",
649
+ "execution_count": null,
650
+ "id": "425f9149-d8ac-437a-9572-301bd1b1bec8",
651
+ "metadata": {},
652
+ "outputs": [],
653
+ "source": []
654
+ },
655
  {
656
  "cell_type": "code",
657
  "execution_count": null,
 
659
  "metadata": {},
660
  "outputs": [],
661
  "source": [
662
+ "%%time \n",
663
+ "folder = 'Climate_risks/Historical_fire_perimeters'\n",
664
+ "name = 'calfire_2023'\n",
665
+ "\n",
666
+ "unzip(folder = folder, file = 'fire23-1gdb.zip')\n",
667
  "gdf = gpd.read_file('fire23_1.gdb')\n",
668
  "gdf = gdf[~gdf['YEAR_'].isna()]\n",
669
  "gdf['YEAR_'] = gdf['YEAR_'].astype('int64')\n",
670
+ "# gdf = gdf[gdf['YEAR_']>=2014]\n",
671
+ "process_vector(folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
672
+ "# convert_pmtiles(folder = folder, file = f\"{name}.parquet\")\n",
673
  "\n",
674
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= ['INC_NUM','FIRE_NAME','YEAR_'])"
 
675
  ]
676
  },
677
  {
 
748
  "metadata": {},
749
  "outputs": [],
750
  "source": [
751
+ "%%time \n",
752
+ "folder = 'Progress_data_new_protection/Newly_counted_lands'\n",
753
+ "name = 'newly_counted_lands_2024'\n",
754
+ "\n",
755
+ "unzip(folder = folder, file = f\"{name}.shp.zip\")\n",
756
+ "process_vector(folder = folder, file = f\"{name}.shp\")\n",
757
+ "# convert_pmtiles(folder = folder, file = f\"{name}.parquet\")\n",
758
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= ['ORIG_FID'])\n"
759
  ]
760
  },
761
  {
 
773
  "metadata": {},
774
  "outputs": [],
775
  "source": [
776
+ "%%time \n",
777
+ "folder = 'Progress_data_new_protection/DAC'\n",
778
+ "name = 'DAC_2022'\n",
779
+ "\n",
780
+ "unzip(folder = folder, file = 'sb535dacgdbf2022gdb.zip')\n",
781
+ "process_vector(folder = folder, file = 'SB535DACgdb_F_2022.gdb', file_name = f\"{name}.parquet\")\n",
782
+ "# convert_pmtiles(folder = folder, file = f\"{name}.parquet\")\n",
783
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= ['GEOID'])\n"
784
  ]
785
  },
786
  {
 
797
  "id": "fc236b85-9fc4-4589-8dd9-5efb7f2e9614",
798
  "metadata": {},
799
  "outputs": [],
800
+ "source": [
801
+ "%%time \n",
802
+ "con = ibis.duckdb.connect('priority_pop',extensions = [\"spatial\", \"h3\"])\n",
803
+ "set_secrets(con)\n",
804
+ "\n",
805
+ "folder = 'Progress_data_new_protection/Priority_populations'\n",
806
+ "name = 'CalEnviroScreen4'\n",
807
+ "unzip(folder = folder, file = 'Priority Populations 4.0 Geodatabase.zip')\n",
808
+ "\n",
809
+ "gdf = (con.read_geo('Priority Populations 4.0 Combined Layer.gdb')\n",
810
+ " .mutate(id=ibis.row_number().over()) #making a unique id \n",
811
+ " ).execute().set_crs('EPSG:3857')\n",
812
+ "\n",
813
+ "process_vector(folder = folder, file = 'Priority Populations 4.0 Combined Layer.gdb',\n",
814
+ " file_name = f\"{name}.parquet\", gdf = gdf)\n",
815
+ "\n",
816
+ "# convert_pmtiles(folder = folder, file = f\"{name}.parquet\")\n",
817
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= [\"id\"])\n"
818
+ ]
819
+ },
820
+ {
821
+ "cell_type": "code",
822
+ "execution_count": null,
823
+ "id": "e64129da-f369-425f-afcc-bc595a89fb7d",
824
+ "metadata": {},
825
+ "outputs": [],
826
+ "source": [
827
+ "file = f\"{name}.parquet\"\n",
828
+ "folder = 'Progress_data_new_protection/Priority_populations'\n",
829
+ "name = 'CalEnviroScreen4'\n",
830
+ "bucket, path = info(folder, file)\n",
831
+ "# path, file = os.path.split(path)\n",
832
+ "# name, ext = os.path.splitext(file)\n",
833
+ "# join_chunked(bucket, path, file)\n",
834
+ "con.read_parquet(f\"s3://{bucket}/{folder}/hex/{file}_part_000.parquet\").head(10).execute()"
835
+ ]
836
  },
837
  {
838
  "cell_type": "markdown",
 
849
  "metadata": {},
850
  "outputs": [],
851
  "source": [
852
+ "folder = 'Progress_data_new_protection/Low_income_communities'\n",
853
+ "name = 'low_income_CalEnviroScreen4'\n",
854
+ "\n",
855
+ "unzip(folder = folder, file = 'Priority Populations 4.0 Geodatabase.zip')\n",
856
  "\n",
857
  "gdf = gpd.read_file('Priority Populations 4.0 Combined Layer.gdb')\n",
858
  "gdf = gdf[gdf['Designatio'] =='Low-income community']\n",
859
+ "process_vector(folder = folder, file = f\"{name}.parquet\", gdf = gdf)\n",
860
+ "convert_pmtiles(folder = folder, file = f\"{name}.parquet\")"
861
  ]
862
  },
863
  {
 
875
  "metadata": {},
876
  "outputs": [],
877
  "source": [
878
+ "folder = 'Progress_data_new_protection/Land_Status_Zone_Ecoregion_Counties'\n",
879
+ "name = 'all_regions_reGAP_county_eco'\n",
880
+ "\n",
881
+ "unzip(folder = folder, file = 'Land_Status_Zone_Ecoregion_Counties.shp.zip')\n",
882
+ "process_vector(folder = folder, file = 'Land_Status_Zone_Ecoregion_Counties.shp',\n",
883
+ " file_name = f\"{name}.parquet\")\n",
884
+ "convert_pmtiles(folder = folder, file = f\"{name}.parquet\")"
885
+ ]
886
+ },
887
+ {
888
+ "cell_type": "markdown",
889
+ "id": "df6e2e1e-b74f-4b14-8140-7e425a3dec20",
890
+ "metadata": {},
891
+ "source": [
892
+ "# CA Nature data"
893
+ ]
894
+ },
895
+ {
896
+ "cell_type": "code",
897
+ "execution_count": null,
898
+ "id": "ecc0f168-badd-4e4d-b97b-ee7891afaa4e",
899
+ "metadata": {},
900
+ "outputs": [],
901
+ "source": [
902
+ "# def convert_h3_2(con, folder, file, cols, zoom = \"8\"):\n",
903
+ "# cols = \", \".join(cols) if isinstance(cols,list) else cols #unpack columns \n",
904
+ "# bucket = 'public-ca30x30'\n",
905
+ "# name = 'ca-30x30-base'\n",
906
+ "# file = 'ca-30x30-base.parquet'\n",
907
+ "# folder = \"Preprocessing\"\n",
908
+ "# name= name.replace('-','')\n",
909
+ "\n",
910
+ "\n",
911
+ "# # reproject \n",
912
+ "# # (con.read_parquet(f\"s3://{bucket}/{file}\")\n",
913
+ "# # .mutate(geom = _.geom.convert('epsg:3310','epsg:4326'))\n",
914
+ "# # ).to_parquet(f\"s3://{bucket}/hex/{file}\")\n",
915
+ "\n",
916
+ "# con.read_parquet(f\"s3://{bucket}/{folder}/{file}\", table_name = name)\n",
917
+ "\n",
918
+ "# con.sql(f'''\n",
919
+ "# WITH t2 AS (\n",
920
+ "# WITH t1 AS (\n",
921
+ "# SELECT {cols}, ST_Dump(geom) AS geom \n",
922
+ "# FROM {name}\n",
923
+ "# ) \n",
924
+ "# SELECT {cols},\n",
925
+ "# h3_polygon_wkt_to_cells_string(UNNEST(geom).geom, {zoom}) AS h{zoom}\n",
926
+ "# FROM t1\n",
927
+ "# )\n",
928
+ "# SELECT *, UNNEST(h{zoom}) AS h{zoom} FROM t2\n",
929
+ "# ''').to_parquet(f\"s3://{bucket}/{folder}/hex/{file}\")"
930
+ ]
931
+ },
932
+ {
933
+ "cell_type": "code",
934
+ "execution_count": null,
935
+ "id": "16f9f330-c10c-4cec-9eba-0878aab9a5f7",
936
+ "metadata": {},
937
+ "outputs": [],
938
+ "source": [
939
+ "%%time \n",
940
+ "con = ibis.duckdb.connect('ca_30x30_base',extensions = [\"spatial\", \"h3\"])\n",
941
+ "set_secrets(con)\n",
942
+ "\n",
943
+ "# file = 'ca-30x30-base.parquet'\n",
944
+ "folder = \"Preprocessing\"\n",
945
+ "name = 'ca-30x30-base'\n",
946
+ "# download(folder = folder, file = f\"{name}.parquet\")\n",
947
+ "\n",
948
+ "# gdf = gpd.read_parquet(f\"{name}.parquet\")\n",
949
+ "process_vector(folder = folder, file = f\"{name}.parquet\")\n",
950
+ "convert_h3(con, s3, folder = folder, file = f\"{name}.parquet\", cols= [\"id\"])\n",
951
+ "\n",
952
+ "# convert_h3_2(con, folder = folder, file = f\"{name}.parquet\", cols= [\"id\"])\n",
953
+ "\n"
954
+ ]
955
+ },
956
+ {
957
+ "cell_type": "code",
958
+ "execution_count": null,
959
+ "id": "908786ec-2a86-4fb0-a47a-9de364254806",
960
+ "metadata": {},
961
+ "outputs": [],
962
+ "source": [
963
+ "# url = f\"s3://public-ca30x30/{folder}/hex/{name}.parquet\"\n",
964
+ "# # url = f\"s3://public-ca30x30/{folder}/{name}.parquet\"\n",
965
+ "\n",
966
+ "# con.read_parquet(url).head(10).execute()\n"
967
+ ]
968
+ },
969
+ {
970
+ "cell_type": "code",
971
+ "execution_count": null,
972
+ "id": "65d98aa3-041f-42d6-8448-6fa8a05f5850",
973
+ "metadata": {},
974
+ "outputs": [],
975
+ "source": [
976
+ "# file = 'ca-30x30-base.parquet'\n",
977
+ "# folder = \"Preprocessing\"\n",
978
+ "# bucket = 'public-ca30x30'\n",
979
+ "# con.read_parquet(f\"s3://{bucket}/{folder}/hex/{file}\").head(10).execute()"
980
  ]
981
+ },
982
+ {
983
+ "cell_type": "code",
984
+ "execution_count": null,
985
+ "id": "fa960a99-3c79-4e67-becf-a1cb397aa5fb",
986
+ "metadata": {},
987
+ "outputs": [],
988
+ "source": []
989
  }
990
  ],
991
  "metadata": {
 
1004
  "name": "python",
1005
  "nbconvert_exporter": "python",
1006
  "pygments_lexer": "ipython3",
1007
+ "version": "3.12.9"
1008
  }
1009
  },
1010
  "nbformat": 4,
preprocess/h3_utils.py ADDED
@@ -0,0 +1,189 @@
1
+ import os
+
+ from minio.error import S3Error
+
+ from utils import info  # info() is defined in preprocess/utils.py
2
+ def geom_to_h3(con, name, cols, zoom):
3
+ """
4
+ Computes hexes
5
+ """
6
+ con.raw_sql(f'''
7
+ CREATE OR REPLACE TEMP TABLE t2 AS
8
+ WITH t1 AS (
9
+ SELECT {cols}, ST_Dump(geom) AS geom
10
+ FROM {name}
11
+ )
12
+ SELECT {cols},
13
+ h3_polygon_wkt_to_cells_string(UNNEST(geom).geom, {zoom}) AS h{zoom}
14
+ FROM t1
15
+ ''')
16
+
17
+ def check_size(con, name, zoom, sample_size=100):
18
+ """
19
+ Estimating size of geoms to decide if we need to process in chunks
20
+ """
21
+ query = f"""
22
+ SELECT
23
+ avg(len(h3_polygon_wkt_to_cells_string(ST_AsText(geom), {zoom}))::DOUBLE) AS avg_h3_len,
24
+ max(len(h3_polygon_wkt_to_cells_string(ST_AsText(geom), {zoom}))) AS max_h3_len,
25
+ count(*) AS total_rows
26
+ FROM {name}
27
+ USING SAMPLE {sample_size}
28
+ """
29
+ stats = con.sql(query).execute()
30
+ avg_len = stats.iloc[0]['avg_h3_len']
31
+ max_len = stats.iloc[0]['max_h3_len']
32
+ total_rows = con.table(name).count().execute()
33
+
34
+ est_total_h3 = avg_len * total_rows
35
+
36
+ print(f"Estimated total H3 cells: {est_total_h3:,.0f}")
37
+ print(f"Max H3 cells in one geometry: {max_len:,}")
38
+
39
+ return est_total_h3, max_len
40
+
41
+
42
+ def write_large_geoms(con, s3, bucket, path, name, zoom="8", geom_len_threshold=10_000):
43
+ """
44
+ Individually processing large geoms (different from processing "chunks")
45
+ """
46
+ offset = 0
47
+ i = 0
48
+ limit=3000
49
+ while True:
50
+ large_key = f"{path}/hex/{name}_large_{i:03d}.parquet"
51
+ print(f"🟠 Checking large geometry batch {i} → {large_key}")
52
+
53
+ # check if file already exists in minio
54
+ try:
55
+ s3.stat_object(bucket, large_key)
56
+ print(f"⏩ Skipping existing large batch: {large_key}")
57
+ offset += limit
58
+ i += 1
59
+ continue
60
+ except S3Error as err:
61
+ if err.code != "NoSuchKey":
62
+ raise
63
+
64
+ print(f"📝 Writing large geometry batch {i} → {large_key}")
65
+ q = con.sql(f'''
66
+ SELECT *, UNNEST(h{zoom}) AS h{zoom}
67
+ FROM t2
68
+ WHERE len(h{zoom}) > {geom_len_threshold}
69
+ LIMIT {limit} OFFSET {offset}
70
+ ''')
71
+
72
+ q.to_parquet(f"s3://{bucket}/{large_key}")
73
+
74
+ if q.count().execute() == 0:
75
+ break
76
+
77
+ offset += limit
78
+ i += 1
79
+
80
+ return i
81
+
82
+
83
+ def join_large_geoms(con, s3, bucket, path, name):
84
+ """
85
+ If we had to process large geoms individually, join those datasets after conversion.
86
+ """
87
+ # check if any large files exist before trying to join
88
+ test_key = f"{path}/hex/{name}_large_000.parquet"
89
+
90
+ try:
91
+ s3.stat_object(bucket, test_key)
92
+ except S3Error as err:
93
+ if err.code == "NoSuchKey":
94
+ print("✅ No large geometry chunks to join.")
95
+ return
96
+ else:
97
+ raise
98
+ # join if it exists
99
+ con.raw_sql(f'''
100
+ COPY (
101
+ SELECT * FROM read_parquet('s3://{bucket}/{path}/hex/{name}_large_*.parquet')
102
+ )
103
+ TO 's3://{bucket}/{path}/hex/{name}_large.parquet'
104
+ (FORMAT PARQUET)
105
+ ''')
106
+
107
+
108
+ def chunk_data(con, s3, bucket, path, name, zoom="8", limit=100_000, geom_len_threshold=10_000):
109
+ """
110
+ Processing large files in chunks.
111
+ """
112
+ offset = 0
113
+ i = 0
114
+
115
+ while True:
116
+ chunk_path = f"{path}/hex/{name}_chunk{i:03d}.parquet"
117
+
118
+ try:
119
+ s3.stat_object(bucket, chunk_path)
120
+ print(f"⏩ Skipping existing chunk: {chunk_path}")
121
+ offset += limit
122
+ i += 1
123
+ continue
124
+ except S3Error as err:
125
+ if err.code != "NoSuchKey":
126
+ raise
127
+
128
+ print(f"📝 Writing chunk {i} → {chunk_path}")
129
+ q = con.sql(f'''
130
+ SELECT *, UNNEST(h{zoom}) AS h{zoom}
131
+ FROM t2
132
+ WHERE len(h{zoom}) <= {geom_len_threshold}
133
+ LIMIT {limit} OFFSET {offset}
134
+ ''')
135
+ q.to_parquet(f"s3://{bucket}/{chunk_path}")
136
+ if q.count().execute() == 0:
137
+ break
138
+ offset += limit
139
+ i += 1
140
+
141
+ # process large geometries using same threshold and limit
142
+ write_large_geoms(con, s3, bucket, path, name, zoom, geom_len_threshold=geom_len_threshold)
143
+ join_large_geoms(con, s3, bucket, path, name)
144
+ return i
145
+
146
+
147
+
148
+ def join_chunked(con, bucket, path, name):
149
+ """
150
+ If we had to chunk the data, join those datasets after conversion.
151
+ """
152
+ con.raw_sql(f'''
153
+ COPY (
154
+ SELECT * FROM read_parquet('s3://{bucket}/{path}/hex/{name}_chunk*.parquet')
155
+ )
156
+ TO 's3://{bucket}/{path}/hex/{name}.parquet'
157
+ (FORMAT PARQUET)
158
+ ''')
159
+
160
+ # def convert_h3(con, folder, file, cols, zoom="8", limit=100_000, geom_len_threshold=10_000):
161
+ def convert_h3(con, s3, folder, file, cols, zoom="8", limit=100_000, geom_len_threshold=5_000):
162
+ """
163
+ Driver function to convert geometries to h3
164
+ """
165
+ cols = ", ".join(cols) if isinstance(cols, list) else cols
166
+ bucket, path = info(folder, file)
167
+ path, file = os.path.split(path)
168
+ name, ext = os.path.splitext(file)
169
+ name = name.replace('-', '')
170
+
171
+ print(f"Processing: {name}")
172
+ con.read_parquet(f"s3://{bucket}/{path}/{file}", table_name=name)
173
+
174
+ # Decide to chunk or not
175
+ est_total, max_per_geom = check_size(con, name, zoom)
176
+ # if est_total > 500_000 or max_per_geom > geom_len_threshold:
177
+
178
+ if est_total > 1_000_000 or max_per_geom > geom_len_threshold:
179
+ print("Chunking due to estimated size")
180
+ geom_to_h3(con, name, cols, zoom)
181
+ chunk_data(con, s3, bucket, path, name, zoom, limit, geom_len_threshold)
182
+ join_chunked(con, bucket, path, name)
183
+ else:
184
+ print("Writing single output")
185
+ geom_to_h3(con, name, cols, zoom)
186
+ con.sql(f'''
187
+ SELECT *, UNNEST(h{zoom}) AS h{zoom}
188
+ FROM t2
189
+ ''').to_parquet(f"s3://{bucket}/{path}/hex/{name}.parquet")
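A minimal usage sketch of the convert_h3 driver, mirroring the notebook cells above (the layer, folder, and column names are just examples from this commit, and cng.utils is assumed to provide set_secrets/s3_client as imported in the notebook):

    import ibis
    from cng.utils import set_secrets, s3_client
    from h3_utils import convert_h3

    # one DuckDB connection per layer, with the spatial and h3 extensions loaded
    con = ibis.duckdb.connect('counties', extensions=["spatial", "h3"])
    set_secrets(con)
    s3 = s3_client()

    # index an already-uploaded parquet layer at H3 zoom 8, keeping the listed attribute column(s)
    convert_h3(con, s3, folder='Counties', file='CA_counties.parquet', cols="COUNTY_NAM")

convert_h3 reads s3://public-ca30x30/CBN-data/Counties/CA_counties.parquet, estimates the total H3 cell count with check_size, and then either writes a single hex/CA_counties.parquet or falls back to the chunked path (chunk_data plus join_chunked) for large layers.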
preprocess/preprocess.ipynb CHANGED
@@ -10,7 +10,7 @@
10
  },
11
  {
12
  "cell_type": "code",
13
- "execution_count": null,
14
  "id": "f7e6298c-d886-432a-a1b7-c3fee914c24f",
15
  "metadata": {
16
  "editable": true,
@@ -25,12 +25,13 @@
25
  "data_path = '../data/CBN-layers/'\n",
26
  "os.chdir(data_path)\n",
27
  "\n",
28
- "from cng.utils import ST_MakeValid, set_secrets, s3_client, s3_cp, to_pmtiles\n",
29
  "s3 = s3_client()\n",
30
  "\n",
 
31
  "import ibis\n",
32
- "\n",
33
  "from ibis import _\n",
 
34
  "con = ibis.duckdb.connect(extensions=[\"spatial\"])\n",
35
  "\n",
36
  "import geopandas as gpd\n",
@@ -47,7 +48,7 @@
47
  },
48
  {
49
  "cell_type": "code",
50
- "execution_count": null,
51
  "id": "63dd33b8-6d3c-4852-9899-6ed5775d19c0",
52
  "metadata": {},
53
  "outputs": [],
@@ -60,7 +61,13 @@
60
  " else:\n",
61
  " path = os.path.join(bucket,base_folder,folder,file)\n",
62
  " url = minio+path\n",
63
- " return url"
64
  ]
65
  },
66
  {
@@ -73,14 +80,14 @@
73
  },
74
  {
75
  "cell_type": "code",
76
- "execution_count": null,
77
  "id": "13214bbe-3a74-4247-981f-5a6eb6c486f5",
78
  "metadata": {},
79
  "outputs": [],
80
  "source": [
81
  "# CA Nature data \n",
82
- "# ca_raw_parquet = \"https://data.source.coop/cboettig/ca30x30/ca_areas.parquet\"\n",
83
- "ca_raw_parquet = 'ca_areas.parquet'\n",
84
  "\n",
85
  "# Boundary of CA, used to computed 'non-conserved' areas\n",
86
  "ca_boundary_parquet = get_url('Preprocessing','ca_boundary.parquet',base_folder = None)\n",
@@ -160,11 +167,45 @@
160
  },
161
  {
162
  "cell_type": "code",
163
- "execution_count": null,
164
  "id": "0f9666d1-7c2b-45af-9399-e4189bba34f5",
165
  "metadata": {},
166
- "outputs": [],
 
 
167
  "source": [
 
168
  "# match CA Nature schema \n",
169
  "\n",
170
  "non_conserved = (con.read_parquet(ca_nonconserved_url)\n",
@@ -200,7 +241,7 @@
200
  },
201
  {
202
  "cell_type": "code",
203
- "execution_count": null,
204
  "id": "a3d4f189-1563-4868-9f1f-64d67569df27",
205
  "metadata": {},
206
  "outputs": [],
@@ -257,7 +298,7 @@
257
  },
258
  {
259
  "cell_type": "code",
260
- "execution_count": null,
261
  "id": "a59c976b-3c36-40f9-a15b-cefcd155c647",
262
  "metadata": {},
263
  "outputs": [],
@@ -303,11 +344,60 @@
303
  },
304
  {
305
  "cell_type": "code",
306
- "execution_count": null,
307
  "id": "4d6177e2-8ece-4eb9-acc2-5fb5c5beb8bb",
308
  "metadata": {},
309
- "outputs": [],
 
 
310
  "source": [
 
311
  "counties = con.read_parquet('../CA_counties.parquet')\n",
312
  "# ca = con.read_parquet(ca_temp_parquet)\n",
313
  "\n",
@@ -570,7 +660,7 @@
570
  "name": "python",
571
  "nbconvert_exporter": "python",
572
  "pygments_lexer": "ipython3",
573
- "version": "3.12.8"
574
  }
575
  },
576
  "nbformat": 4,
 
10
  },
11
  {
12
  "cell_type": "code",
13
+ "execution_count": 1,
14
  "id": "f7e6298c-d886-432a-a1b7-c3fee914c24f",
15
  "metadata": {
16
  "editable": true,
 
25
  "data_path = '../data/CBN-layers/'\n",
26
  "os.chdir(data_path)\n",
27
  "\n",
28
+ "from cng.utils import set_secrets, s3_client, s3_cp, to_pmtiles\n",
29
  "s3 = s3_client()\n",
30
  "\n",
31
+ " \n",
32
  "import ibis\n",
 
33
  "from ibis import _\n",
34
+ "import ibis.expr.datatypes as dt\n",
35
  "con = ibis.duckdb.connect(extensions=[\"spatial\"])\n",
36
  "\n",
37
  "import geopandas as gpd\n",
 
48
  },
49
  {
50
  "cell_type": "code",
51
+ "execution_count": 2,
52
  "id": "63dd33b8-6d3c-4852-9899-6ed5775d19c0",
53
  "metadata": {},
54
  "outputs": [],
 
61
  " else:\n",
62
  " path = os.path.join(bucket,base_folder,folder,file)\n",
63
  " url = minio+path\n",
64
+ " return url\n",
65
+ "\n",
66
+ "\n",
67
+ "# usage: t.mutate(geom_valid = ST_MakeValid(t.geom))\n",
68
+ "@ibis.udf.scalar.builtin\n",
69
+ "def ST_MakeValid(geom) -> dt.geometry:\n",
70
+ " ..."
71
  ]
72
  },
73
  {
 
80
  },
81
  {
82
  "cell_type": "code",
83
+ "execution_count": 3,
84
  "id": "13214bbe-3a74-4247-981f-5a6eb6c486f5",
85
  "metadata": {},
86
  "outputs": [],
87
  "source": [
88
  "# CA Nature data \n",
89
+ "ca_raw_parquet = \"https://data.source.coop/cboettig/ca30x30/ca_areas.parquet\"\n",
90
+ "# ca_raw_parquet = 'ca_areas.parquet'\n",
91
  "\n",
92
  "# Boundary of CA, used to computed 'non-conserved' areas\n",
93
  "ca_boundary_parquet = get_url('Preprocessing','ca_boundary.parquet',base_folder = None)\n",
 
167
  },
168
  {
169
  "cell_type": "code",
170
+ "execution_count": 7,
171
  "id": "0f9666d1-7c2b-45af-9399-e4189bba34f5",
172
  "metadata": {},
173
+ "outputs": [
174
+ {
175
+ "data": {
176
+ "application/vnd.jupyter.widget-view+json": {
177
+ "model_id": "52ef18913f17417299860d91e36e9dbd",
178
+ "version_major": 2,
179
+ "version_minor": 0
180
+ },
181
+ "text/plain": [
182
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
183
+ ]
184
+ },
185
+ "metadata": {},
186
+ "output_type": "display_data"
187
+ },
188
+ {
189
+ "name": "stdout",
190
+ "output_type": "stream",
191
+ "text": [
192
+ "CPU times: user 4min 28s, sys: 6.1 s, total: 4min 34s\n",
193
+ "Wall time: 2min 18s\n"
194
+ ]
195
+ },
196
+ {
197
+ "data": {
198
+ "text/plain": [
199
+ "<minio.helpers.ObjectWriteResult at 0x7ff0943c7710>"
200
+ ]
201
+ },
202
+ "execution_count": 7,
203
+ "metadata": {},
204
+ "output_type": "execute_result"
205
+ }
206
+ ],
207
  "source": [
208
+ "%%time \n",
209
  "# match CA Nature schema \n",
210
  "\n",
211
  "non_conserved = (con.read_parquet(ca_nonconserved_url)\n",
 
241
  },
242
  {
243
  "cell_type": "code",
244
+ "execution_count": 4,
245
  "id": "a3d4f189-1563-4868-9f1f-64d67569df27",
246
  "metadata": {},
247
  "outputs": [],
 
298
  },
299
  {
300
  "cell_type": "code",
301
+ "execution_count": 5,
302
  "id": "a59c976b-3c36-40f9-a15b-cefcd155c647",
303
  "metadata": {},
304
  "outputs": [],
 
344
  },
345
  {
346
  "cell_type": "code",
347
+ "execution_count": 6,
348
  "id": "4d6177e2-8ece-4eb9-acc2-5fb5c5beb8bb",
349
  "metadata": {},
350
+ "outputs": [
351
+ {
352
+ "data": {
353
+ "application/vnd.jupyter.widget-view+json": {
354
+ "model_id": "09f24f1359a84ae2a4b69360cc8e852b",
355
+ "version_major": 2,
356
+ "version_minor": 0
357
+ },
358
+ "text/plain": [
359
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
360
+ ]
361
+ },
362
+ "metadata": {},
363
+ "output_type": "display_data"
364
+ },
365
+ {
366
+ "data": {
367
+ "application/vnd.jupyter.widget-view+json": {
368
+ "model_id": "c10ce980d24e45b6bad9b8a70c176f2c",
369
+ "version_major": 2,
370
+ "version_minor": 0
371
+ },
372
+ "text/plain": [
373
+ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
374
+ ]
375
+ },
376
+ "metadata": {},
377
+ "output_type": "display_data"
378
+ },
379
+ {
380
+ "name": "stderr",
381
+ "output_type": "stream",
382
+ "text": [
383
+ "/opt/conda/lib/python3.12/site-packages/ibis/common/deferred.py:408: FutureWarning: `Value.case` is deprecated as of v10.0.0; use value.cases() or ibis.cases()\n",
384
+ " return func(*args, **kwargs)\n"
385
+ ]
386
+ },
387
+ {
388
+ "ename": "NameError",
389
+ "evalue": "name 'non_conserved' is not defined",
390
+ "output_type": "error",
391
+ "traceback": [
392
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
393
+ "\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
394
+ "\u001b[36mFile \u001b[39m\u001b[32m<timed exec>:50\u001b[39m\n",
395
+ "\u001b[31mNameError\u001b[39m: name 'non_conserved' is not defined"
396
+ ]
397
+ }
398
+ ],
399
  "source": [
400
+ "%%time \n",
401
  "counties = con.read_parquet('../CA_counties.parquet')\n",
402
  "# ca = con.read_parquet(ca_temp_parquet)\n",
403
  "\n",
 
660
  "name": "python",
661
  "nbconvert_exporter": "python",
662
  "pygments_lexer": "ipython3",
663
+ "version": "3.12.9"
664
  }
665
  },
666
  "nbformat": 4,
preprocess/utils.py ADDED
@@ -0,0 +1,198 @@
1
+ from minio.error import S3Error
2
+
3
+ import zipfile
4
+ import os
5
+ import subprocess
6
+
7
+ import geopandas as gpd
8
+ import ibis
9
+ from ibis import _
10
+
11
+ import rasterio
12
+ from rasterio.features import shapes
13
+ from shapely.geometry import shape
14
+ import numpy as np
15
+
16
+
17
+ def info(folder, file, bucket = "public-ca30x30", base_folder = 'CBN-data/'):
18
+ """
19
+ Extract minio path to upload/download data
20
+ """
21
+ path = os.path.join(base_folder, folder, file)
22
+ # path = os.path.join(folder, file)
23
+ return bucket, path
24
+
25
+ def download(s3, folder, file, file_name = None):
26
+ """
27
+ Downloading file from minio
28
+ """
29
+ if not file_name:
30
+ file_name = file
31
+ bucket, path = info(folder, file)
32
+ s3.fget_object(bucket, path ,file_name)
33
+ return
34
+
35
+ def upload(s3, folder, file):
36
+ """
37
+ Uploading file to minio
38
+ """
39
+ bucket, path = info(folder, file)
40
+ s3.fput_object(bucket, path ,file)
41
+ return
42
+
43
+ def unzip(folder, file):
44
+ """
45
+ Unzipping zip files
46
+ """
47
+ download(s3, folder, file)
48
+ with zipfile.ZipFile(file, 'r') as zip_ref:
49
+ zip_ref.extractall()
50
+ return
51
+
52
+ # def process_vector(folder, file, file_name = None, gdf = None, crs="EPSG:3310"):
53
+ def process_vector(folder, file, file_name = None, gdf = None, crs="EPSG:4326"):
54
+ """
55
+ Driver function to process vectors
56
+ """
57
+ if gdf is None:
58
+ gdf = gpd.read_file(file)
59
+ if gdf.crs != crs:
60
+ gdf = gdf.to_crs(crs)
61
+ if gdf.geometry.name != 'geom':
62
+ gdf = gdf.rename_geometry('geom')
63
+ if file_name:
64
+ file = file_name
65
+ # upload_parquet(folder, file, gdf)
66
+ name, ext = os.path.splitext(file)
67
+ parquet_file = f"{name}{'.parquet'}"
68
+ gdf.to_parquet(parquet_file)
69
+ upload(s3, folder, parquet_file)
70
+ return
71
+
72
+
73
+ # def upload_parquet(folder, file, gdf):
74
+ # """
75
+ # Uploading parquets
76
+ # """
77
+ # name, ext = os.path.splitext(file)
78
+ # parquet_file = f"{name}{'.parquet'}"
79
+ # gdf.to_parquet(parquet_file)
80
+ # upload(folder, parquet_file)
81
+ # return
82
+
83
+
84
+ def reproject_raster(input_file, crs="EPSG:3310"):
85
+ """
86
+ Reproject rasters
87
+ """
88
+ suffix = '_processed'
89
+ name, ext = os.path.splitext(input_file)
90
+ output_file = f"{name}{suffix}{ext}"
91
+ command = [
92
+ "gdalwarp",
93
+ "-t_srs", crs,
94
+ input_file,
95
+ output_file
96
+ ]
97
+ try:
98
+ subprocess.run(command, check=True)
99
+ print(f"Reprojection successful!")
100
+ except subprocess.CalledProcessError as e:
101
+ print(f"Error occurred during reprojection: {e}")
102
+ return output_file
103
+
104
+ def make_cog(input_file, crs="EPSG:4326"):
105
+ """
106
+ Converting TIF to COGs
107
+ """
108
+ suffix = '_COG'
109
+ name, ext = os.path.splitext(input_file)
110
+ output_file = f"{name}{suffix}{ext}"
111
+ command = [
112
+ "gdalwarp",
113
+ "-t_srs", crs,
114
+ "-of", "COG",
115
+ input_file,
116
+ output_file
117
+ ]
118
+ try:
119
+ subprocess.run(command, check=True)
120
+ print(f"Successful!")
121
+ except subprocess.CalledProcessError as e:
122
+ print(f"Error occurred during processing: {e}")
123
+ return output_file
124
+
125
+ def make_vector(input_file, crs="EPSG:4326"):
126
+ """
127
+ Converting rasters to vector formats in order to convert to h3
128
+ """
129
+ name, ext = os.path.splitext(input_file)
130
+ output_file = f"{name}.parquet"
131
+ # Open raster
132
+ with rasterio.open(input_file) as src:
133
+ image = src.read(1) # read first band
134
+ mask = image != src.nodata # mask out nodata
135
+
136
+ results = (
137
+ {"geom": shape(geom), "value": value}
138
+ for geom, value in shapes(image, mask=mask, transform=src.transform)
139
+ )
140
+
141
+ gdf = gpd.GeoDataFrame.from_records(results)
142
+ gdf.set_geometry('geom', inplace=True)
143
+ gdf['id'] = np.arange(len(gdf))
144
+ gdf.set_crs(src.crs, inplace=True)
145
+ if gdf.crs != crs:
146
+ gdf.to_crs(crs, inplace=True)
147
+
148
+ gdf.to_parquet(output_file)
149
+ print(gdf)
150
+ return output_file
151
+
152
+ def filter_raster(s3, folder, file, percentile):
153
+ """
154
+ Helper function to filter rasters
155
+ """
156
+ with rasterio.open(file) as src:
157
+ data = src.read(1) # Read the first band
158
+ profile = src.profile
159
+ # mask no data values
160
+ masked_data = np.ma.masked_equal(data, src.nodata)
161
+
162
+ # compute percentile/threshold
163
+ p = np.percentile(masked_data.compressed(),percentile)
164
+ filtered = np.where(data >= p, data, src.nodata)
165
+ name, ext = os.path.splitext(file)
166
+ new_file = f"{name}{'_'}{percentile}{'percentile'}{ext}"
167
+
168
+ profile.update(dtype=rasterio.float64)
169
+ with rasterio.open(new_file, "w", **profile) as dst:
170
+ dst.write(filtered, 1)
171
+ process_raster(s3, folder, file)
172
+ return
173
+
174
+ def process_raster(s3, folder, file, file_name = None):
175
+ """
176
+ Driver function to process rasters
177
+ """
178
+ if file_name:
179
+ file = file_name
180
+ output_file = reproject_raster(file)
181
+ upload(s3, folder, output_file)
182
+ output_cog_file = make_cog(output_file)
183
+ upload(s3, folder, output_cog_file)
184
+ output_vector = make_vector(output_file)
185
+ upload(s3, folder, output_vector)
186
+ return
187
+
188
+ def convert_pmtiles(folder, file):
189
+ """
190
+ Convert to PMTiles with tippecanoe
191
+ """
192
+ name, ext = os.path.splitext(file)
193
+ if ext != '.geojson':
194
+ con.read_parquet(file).execute().set_crs('epsg:3310').to_crs('epsg:4326').to_file(name+'.geojson')
195
+ to_pmtiles(name+'.geojson', name+'.pmtiles', options = ['--extend-zooms-if-still-dropping'])
196
+ upload(s3, folder, name+'.pmtiles')
197
+ return
198
+
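Taken together with h3_utils.py, a rough end-to-end sketch for one raster layer, assuming the signatures above and mirroring the Climate_zones cell in the notebook (the file and layer names are illustrative):

    import ibis
    from cng.utils import set_secrets, s3_client
    from utils import download, process_raster
    from h3_utils import convert_h3

    s3 = s3_client()
    con = ibis.duckdb.connect('climate_zones', extensions=["spatial", "h3"])
    set_secrets(con)

    folder = 'Climate_zones'
    # fetch the source raster from MinIO under the local name used for the processed layer
    download(s3, folder=folder, file='clusters_10.tif', file_name='climate_zones_10.tif')
    # reproject to EPSG:3310, then write and upload the reprojected TIF, a COG, and a vectorized parquet
    process_raster(s3, folder=folder, file='climate_zones_10.tif')
    # index the vectorized raster (the 'id' column is added by make_vector) with H3 cells at zoom 8
    convert_h3(con, s3, folder=folder, file='climate_zones_10_processed.parquet', cols='id')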