# Source code for refresh_database

#!/usr/bin/python3

"""
Script to manually update the database of GADM.

When GADM releases a new version, the table should be updated to make sure that the existing names are still available in the list and that new ones are included. This script is only meant to be executed by maintainers; any PR including unwanted modifications to the database will be refused.
"""

import argparse
import tempfile
import zipfile
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import urlopen

import geopandas as gpd
import pandas as pd
from tqdm import tqdm

from pygadm import __gadm_version__

# command line parser of the script, the module docstring is reused as the help description
# the "-f" option is only registered when the file is executed as a script
parser = argparse.ArgumentParser(description=__doc__, usage="refresh_database")

def _download(url, destination, chunk_size=16 * 1024):
    """Stream the content of ``url`` into ``destination`` and display a progress bar.

    Args:
        url: address of the file to download.
        destination: path of the local file to write.
        chunk_size: number of bytes requested from the server at each read.
    """
    # the response, the progress bar and the file are all released even if the transfer fails
    with urlopen(url) as response:
        with tqdm(total=response.length, unit="iB", unit_scale=True) as pbar:
            with open(destination, "wb") as f:
                # read() returns an empty bytes object once the stream is exhausted
                for chunk in iter(lambda: response.read(chunk_size), b""):
                    f.write(chunk)
                    # count what was really received, the last chunk is usually shorter
                    pbar.update(len(chunk))


def _read_levels(gpkg):
    """Stack the attribute tables of the ADM_0 to ADM_5 layers of ``gpkg``.

    Args:
        gpkg: path to the geopackage holding one layer per administrative level.

    Returns:
        A single dataframe with the layers concatenated from ADM_0 to ADM_5, geometries are ignored.
    """
    # read the file layer by layer and concatenate them in area size order
    levels = [
        gpd.read_file(gpkg, layer=f"ADM_{i}", ignore_geometry=True) for i in range(6)
    ]
    return pd.concat(levels)


if __name__ == "__main__":

    # read arguments
    parser.add_argument(
        "-f",
        dest="gadm_src",
        metavar="source.gpkg",
        help="(str) : path to the GADM file",
        required=False,
        type=Path,
    )

    # parse arguments
    args = parser.parse_args()

    # url of the gadm files, the archive name must match the geopackage it contains ("-levels")
    url = f"https://geodata.ucdavis.edu/gadm/gadm4.1/gadm_{__gadm_version__}-levels.zip"

    # read the all the geodata available in the server at once
    with tempfile.TemporaryDirectory() as tmp_dir:

        tmp_dir = Path(tmp_dir)
        gadm_src = args.gadm_src

        if gadm_src is not None and not zipfile.is_zipfile(gadm_src):
            # the user provided the geopackage itself (as advertised by the "-f" option),
            # there is nothing to download or to extract
            file = gadm_src

        else:
            # check if a download is required
            if gadm_src is not None:
                zip_file = gadm_src
            else:
                zip_file = tmp_dir / urlparse(url).path.split("/")[-1]
                _download(url, zip_file)

            # unzip file
            with zipfile.ZipFile(zip_file, "r") as zip_ref:
                zip_ref.extractall(tmp_dir)
            file = tmp_dir / f"gadm_{__gadm_version__}-levels.gpkg"

        # the table must be loaded before the temporary directory is deleted
        df = _read_levels(file)

    # change database structure to meet pygadm requirements
    df = df.fillna("").rename(columns={"COUNTRY": "NAME_0"})

    # filter all columns but the GID and the NAME
    # we are not including the VARNAME to keep the file size under 3Mo
    columns = ["UID"]
    columns += [f"GID_{i}" for i in range(6)]
    columns += [f"NAME_{i}" for i in range(6)]
    df_filtered = df.filter(items=columns)

    # save it in the data folder
    filename = Path(__file__).parents[1] / "data" / "gadm_database.parquet"

    # the table is stored as a Brotli compressed parquet file
    df_filtered.to_parquet(filename, compression="Brotli")