Source code for refresh_database
#!/usr/bin/python3
"""
Script to manually update the database of GADM.
When GADM releases a new version, the table should be updated to make sure that the existing names are still available in the list and that new ones are included. It is only meant to be executed by maintainers; any PR including unwanted modifications to the database will be refused.
"""
import argparse
import tempfile
import zipfile
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import urlopen
import geopandas as gpd
import pandas as pd
from tqdm import tqdm
from pygadm import __gadm_version__
# Command-line parser for the script. It lives at module level so it stays
# importable; the module docstring doubles as the ``--help`` description.
parser = argparse.ArgumentParser(
    description=__doc__,
    usage="refresh_database",
)
def download_file(url: str, destination: Path, chunk_size: int = 16 * 1024) -> None:
    """Stream ``url`` into ``destination`` while displaying a progress bar.

    Args:
        url: address of the remote file.
        destination: local path the bytes are written to.
        chunk_size: number of bytes requested per read.
    """
    # the response, the output file and the progress bar are all context
    # managed so that they are released even if the transfer fails midway
    with urlopen(url) as response, open(destination, "wb") as sink:
        with tqdm(total=response.length, unit="iB", unit_scale=True) as pbar:
            for chunk in iter(lambda: response.read(chunk_size), b""):
                sink.write(chunk)
                # advance by what was actually received: the last chunk is
                # usually shorter than chunk_size
                pbar.update(len(chunk))


def extract_geopackage(source: Path, workdir: Path, gpkg_name: str) -> Path:
    """Return the path of the geopackage provided by ``source``.

    Args:
        source: either a zip archive containing the geopackage or the
            geopackage file itself.
        workdir: directory in which an archive is extracted.
        gpkg_name: name of the geopackage file expected inside the archive.

    Returns:
        ``workdir / gpkg_name`` when ``source`` is a zip archive, ``source``
        unchanged otherwise.

    Raises:
        FileNotFoundError: if ``source`` is not an existing file.
    """
    source = Path(source)
    if not source.is_file():
        raise FileNotFoundError(f"GADM source file not found: {source}")

    # anything that is not a zip archive is assumed to already be the
    # geopackage (the "-f" option advertises a "source.gpkg" value)
    if not zipfile.is_zipfile(source):
        return source

    with zipfile.ZipFile(source, "r") as archive:
        archive.extractall(Path(workdir))

    return Path(workdir) / gpkg_name


def read_layers(gpkg: Path, nb_levels: int = 6) -> list:
    """Read the attribute tables of the ``ADM_0`` ... ``ADM_<nb_levels - 1>`` layers.

    Args:
        gpkg: path to the geopackage file.
        nb_levels: number of administrative levels to read.

    Returns:
        One table per level, in increasing level order, read without geometry.
    """
    return [
        gpd.read_file(gpkg, layer=f"ADM_{level}", ignore_geometry=True)
        for level in range(nb_levels)
    ]


def build_table(layers: list, nb_levels: int = 6) -> pd.DataFrame:
    """Merge the per-level tables into the structure stored by pygadm.

    Args:
        layers: the per-level tables, ordered by administrative level.
        nb_levels: number of administrative levels kept in the output.

    Returns:
        A single table restricted to the ``UID``, ``GID_<i>`` and ``NAME_<i>``
        columns that exist in the input, missing values replaced by ``""``.
    """
    # concatenate all the tables in level order
    table = pd.concat(layers)

    # change database structure to meet pygadm requirements
    table = table.fillna("").rename(columns={"COUNTRY": "NAME_0"})

    # filter all columns but the GID and the NAME
    # we are not including the VARNAME to keep the file size under 3Mo
    columns = ["UID"]
    columns += [f"GID_{i}" for i in range(nb_levels)]
    columns += [f"NAME_{i}" for i in range(nb_levels)]

    return table.filter(items=columns)


def main(gadm_src=None) -> None:
    """Rebuild the ``gadm_database.parquet`` file of the package.

    Args:
        gadm_src: optional path to a local GADM source (zip archive or
            geopackage). When ``None`` the archive is downloaded first.
    """
    # url of the gadm files. The archive is named after the geopackage it
    # contains ("-levels"); the previous "-level.zip" spelling did not match.
    # NOTE(review): the "gadm4.1" folder is hard-coded and must be kept in
    # sync with __gadm_version__ by hand.
    url = f"https://geodata.ucdavis.edu/gadm/gadm4.1/gadm_{__gadm_version__}-levels.zip"

    # everything downloaded or extracted lives in a directory that is removed
    # once the layers are loaded in memory
    with tempfile.TemporaryDirectory() as tmp_dir:
        workdir = Path(tmp_dir)

        # check if a download is required
        if gadm_src is not None:
            source = Path(gadm_src)
        else:
            source = workdir / urlparse(url).path.split("/")[-1]
            download_file(url, source)

        # unzip file (when needed) and read it layer by layer
        gpkg = extract_geopackage(
            source, workdir, f"gadm_{__gadm_version__}-levels.gpkg"
        )
        layers = read_layers(gpkg)

    table = build_table(layers)

    # save it in the data folder
    filename = Path(__file__).parents[1] / "data" / "gadm_database.parquet"

    # Brotli compression is requested for the written parquet file
    table.to_parquet(filename, compression="Brotli")


if __name__ == "__main__":
    # read arguments
    parser.add_argument(
        "-f",
        dest="gadm_src",
        metavar="source.gpkg",
        help="(str) : path to the GADM file",
        required=False,
        type=Path,
    )

    # parse arguments
    args = parser.parse_args()

    main(args.gadm_src)