# Source code for pygadm

"""
Easy access to the administrative boundaries defined by GADM from Python scripts.

This lib provides access to GADM datasets from a Python script without manually downloading the files from their server. We provide access to the current version (4.1), which delimits 400,276 administrative areas.

The data are freely available for academic use and other non-commercial use. Redistribution or commercial use is not allowed without prior permission. See the license of the GADM project for more details.
"""

import json
import warnings
from difflib import get_close_matches
from itertools import product
from pathlib import Path
from typing import List, Union

import geopandas as gpd
import numpy as np
import pandas as pd
from requests_cache import CachedSession

# HTTP session shared by every request sent to the GADM server. It is a
# ``requests_cache.CachedSession`` registered under the "pygadm" cache name.
session = CachedSession("pygadm", use_temp=True)

# package metadata
__version__ = "0.3.0"
__author__ = "Pierrick Rambaud"
__email__ = "pierrick.rambaud49@gmail.com"

# GADM release targeted by the lib and the URL template of its JSON files.
# The two placeholders are filled with the ISO-3 country code and the
# administrative level (see ``_items``).
__gadm_version__ = "410"  # 4.1
__gadm_url__ = "https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{}_{}.json"

# data files shipped next to this module: the names/codes database and the
# mapping used to resolve continent names
__gadm_data__ = Path(__file__).parent.joinpath("data", "gadm_database.parquet")
__gadm_continent__ = json.loads(
    Path(__file__).parent.joinpath("data", "gadm_continent.json").read_text()
)
# __all__ = ["get_items", "get_names"]
def get_items(
    name: Union[str, List[str]] = "",
    admin: Union[str, List[str]] = "",
    content_level: int = -1,
) -> gpd.GeoDataFrame:
    """
    Return the requested administrative boundaries using the name or the administrative code.

    Return a Geopandas GeoDataFrame representing an administrative region. The region can be requested either by its "name" or its "admin", the lib will identify the area level on the fly. The user can also request for a specific level for the GeoDataFrame features e.g. get all admin level 1 of a country. If nothing is set we will infer the level of the item and if the level is higher than the found item, it will be ignored. If nothing is found the method will raise an error.

    Args:
        name: The name of an administrative area. Cannot be set along with :code:`admin`. It can be a list or a single name. A single continent name is expanded to the admin codes registered for that continent.
        admin: The id of an administrative area in the GADM nomenclature. Cannot be set along with :code:`name`. It can be a list or a single admin code.
        content_level: The level to use in the final dataset. Default to -1 (use level from the area).

    Returns:
        The GeoDataFrame of the requested area with all the GADM attributes.

    Raises:
        ValueError: If neither :code:`name` nor :code:`admin` is set. Errors raised while resolving each individual item are propagated unchanged.
    """
    # set up the loop: both inputs are handled as lists. An empty list is
    # treated like an unset parameter, otherwise the product below would be
    # empty and the concatenation would fail with an unrelated pandas error
    names = [name] if isinstance(name, str) else (list(name) or [""])
    admins = [admin] if isinstance(admin, str) else (list(admin) or [""])

    # check that they are not all empty
    if names == [""] == admins:
        raise ValueError('at least "name" or "admin" need to be set.')

    # special parsing for continents. They are saved as admins to avoid any duplication
    if len(names) == 1 and names[0].lower() in __gadm_continent__:
        admins = list(__gadm_continent__[names[0].lower()])
        names = [""]

    # use itertools, normally one of them is empty so it will raise an error
    # if not the case as admin and name will be set together
    gdf_list = [_items(n, a, content_level) for a, n in product(admins, names)]

    # avoid concat if not needed for speed boost
    return gdf_list[0] if len(gdf_list) == 1 else pd.concat(gdf_list)
def _items(
    name: str = "", admin: str = "", content_level: int = -1
) -> gpd.GeoDataFrame:
    """
    Return the requested administrative boundaries using the name or the administrative code.

    Same method as get_items but only accept single requests in str format.

    Args:
        name: The name of an administrative area. Cannot be set along with :code:`admin`.
        admin: The id of an administrative area in the GADM nomenclature. Cannot be set along with :code:`name`.
        content_level: The level to use in the final dataset. Default to -1 (use level from the area).

    Returns:
        The GeoDataFrame of the requested area with all the GADM attributes.

    Raises:
        ValueError: If the requested name matches more than one area (plus any error raised by :code:`get_names`).
        Exception: If the GADM file cannot be downloaded or parsed.
    """
    # call to get_names without level to raise an error if the requested level won't work
    df = get_names(name, admin)
    if len(df) > 1:
        raise ValueError(
            f'The requested name ("{name}") is not unique ({len(df)} results). '
            'To retrieve it, please use the "admin" parameter instead. '
            "If you don't know the GADM code, use the following code, "
            "it will return the GADM codes as well: "
            f'"get_names(name="{name}")"'
        )

    # level of the requested area and ISO-3 code of its country (first 3
    # characters of the GID)
    level = df.columns[0].replace("NAME_", "")
    iso_3 = df.iloc[0][f"GID_{level}"][:3]

    # now load the useful one to get content_level
    df = get_names(name, admin, content_level)
    content_level = df.columns[0].replace("NAME_", "")

    # checks have already been performed in get_names
    column = "NAME_{}" if name else "GID_{}"
    target = name if name else admin

    # read the data from server
    url = __gadm_url__.format(iso_3, content_level)
    try:
        data = json.loads(session.get(url).content)
    except Exception as err:
        # The data url is automatically build, it should be correct. From time
        # to time the server are down from GADM side so we write down a specific
        # error message if something goes wrong
        raise Exception(
            "We cannot retrieve the data from GADM server. "
            f"Try to manually open the following link: {url}. "
            "If it doesn't work, the error is coming from GADM servers. "
            "If it works please open an issue on our repository: "
            "https://github.com/12rambau/pygadm/issues."
        ) from err

    level_gdf = gpd.GeoDataFrame.from_features(data)
    level_gdf.rename(columns={"COUNTRY": "NAME_0"}, inplace=True)

    # Case-insensitive *literal* comparison. The value must not be handed to
    # ``str.fullmatch`` because it would be interpreted as a regular
    # expression: GID codes contain "." and names can contain parentheses.
    is_target = level_gdf[column.format(level)].str.lower() == target.lower()

    return level_gdf[is_target]
def get_names(
    name: str = "", admin: str = "", content_level: int = -1, complete: bool = False
) -> pd.DataFrame:
    """
    Return the list of names available in an administrative layer using the name or the administrative code.

    Return a pandas DataFrame of the names and GADM codes of an administrative region. The region can be requested either by its "name" or its "admin", the lib will identify the corresponding level on the fly. The user can also request for a specific level for its content e.g. get all admin level 1 of a country. If nothing is set we will infer the level of the item and if the level is higher than the found item, it will be ignored. If nothing is found the method will raise an error.

    Args:
        name: The name of an administrative area. Cannot be set along with :code:`admin`.
        admin: The id of an administrative area in the GADM nomenclature. Cannot be set along with :code:`name`.
        content_level: The level to use in the final dataset. Default to -1 (use level of the selected area).
        complete: If True, the method will return all the names of the higher administrative areas. Default to False.

    Returns:
        The list of all the available names.

    Raises:
        ValueError: If both or none of :code:`name` and :code:`admin` are set, or if the requested value is not part of the database (the message then lists the closest matches).
    """
    # sanitary check on parameters
    if name and admin:
        raise ValueError('"name" and "admin" cannot be set at the same time.')
    elif not name and not admin:
        raise ValueError('at least "name" or "admin" need to be set.')

    # set the value we look for and tell the function if its a name or an admin
    is_name = bool(name)
    target = name if is_name else admin
    target_lower = target.lower()
    column = "NAME_{}" if is_name else "GID_{}"

    # read the data and find if the element exist (case-insensitive search
    # in the 6 level columns)
    df = pd.read_parquet(__gadm_data__)
    is_in = (
        df.filter([column.format(i) for i in range(6)])
        .apply(lambda col: col.str.lower())
        .isin([target_lower])
    )

    if not is_in.any().any():
        # find the 5 closest names/id
        columns = [df[column.format(i)].dropna().str.lower().values for i in range(6)]
        ids = np.unique(np.concatenate(columns))
        close_ids = get_close_matches(target_lower, ids, n=5)
        if is_name:
            close_ids = [i.capitalize() for i in close_ids]
        else:
            close_ids = [i.upper() for i in close_ids]
        suggestions = ", ".join(close_ids)
        raise ValueError(
            f'The requested "{target}" is not part of GADM. The closest matches are: {suggestions}.'
        )

    # Get the level of the identified area: keep the rows with at least one
    # match and read the digit that follows "NAME_" (index 5) or "GID_"
    # (index 4) in the name of the first matching column
    line = is_in[is_in.any(axis=1)].idxmax(axis=1)
    level = line.iloc[0][5 if is_name else 4]

    # Select every row belonging to the requested area. The same lower-cased
    # literal comparison as the lookup above is used on purpose: going
    # through ``str.fullmatch`` would interpret the value as a regular
    # expression ("." in GID codes, parentheses in names) and could select
    # nothing even though the lookup succeeded.
    sub_df = df[df[column.format(level)].str.lower() == target_lower]

    # load the max_level available in the requested area
    max_level = next(i for i in reversed(range(6)) if (sub_df[f"GID_{i}"] != "").any())

    # get the request level from user
    if content_level == -1:
        content_level = level
    elif content_level < int(level):
        warnings.warn(
            f"The requested level ({content_level}) is higher than the area ({level}). Fallback to {level}."
        )
        content_level = level

    if int(content_level) > max_level:
        warnings.warn(
            f"The requested level ({content_level}) is higher than the max level in this country ({max_level}). Fallback to {max_level}."
        )
        content_level = max_level

    # get the columns name to display
    columns = [f"NAME_{content_level}", f"GID_{content_level}"]

    # the list will contain duplicate as all the smaller admin level will be included
    sub_df = sub_df.drop_duplicates(subset=columns, ignore_index=True)

    # the list will contain NA as all the bigger admin level will be selected as well
    # the database is read as pure string so dropna cannot be used
    # .astype is also a vectorized operation so it goes very fast
    sub_df = sub_df[sub_df[columns[0]].astype(bool)]

    # filter the df if complete is set to False, the only displayed columns will be the one requested
    return sub_df if complete is True else sub_df[columns]