From bd2177674c29d6a3c3bd44d94afd6677c3d96ff0 Mon Sep 17 00:00:00 2001 From: ychoquet <53231475+ychoquet@users.noreply.github.com> Date: Wed, 23 Feb 2022 17:27:43 -0500 Subject: [PATCH] Creation of a STAC HATEOAS Provider. (#857) --- docs/source/data-publishing/stac.rst | 302 ++++++++++++++++++++++++++- pygeoapi/plugin.py | 1 + pygeoapi/provider/filesystem.py | 4 +- pygeoapi/provider/hateoas.py | 205 ++++++++++++++++++ pygeoapi/templates/stac/catalog.html | 10 +- pygeoapi/templates/stac/item.html | 2 +- 6 files changed, 513 insertions(+), 11 deletions(-) create mode 100644 pygeoapi/provider/hateoas.py diff --git a/docs/source/data-publishing/stac.rst b/docs/source/data-publishing/stac.rst index 19a1df7..686ae4e 100644 --- a/docs/source/data-publishing/stac.rst +++ b/docs/source/data-publishing/stac.rst @@ -1,13 +1,305 @@ .. _stac: Publishing files to a SpatioTemporal Asset Catalog -================================================== +************************************************** -The `SpatioTemporal Asset Catalog (STAC)`_ specification provides an easy approach -for describing geospatial assets. STAC is typically implemented for imagery and -other raster data. +The `SpatioTemporal Asset Catalog (STAC)`_ family of specifications aim to standardize +the way geospatial asset metadata is structured and queried. A "spatiotemporal asset" +is any file that represents information about the Earth at a certain place and time. +The original focus was on scenes of satellite imagery, but the specifications now cover +a broad variety of uses, including sources such as aircraft and drone and data such as +hyperspectral optical, synthetic aperture radar (SAR), video, point clouds, lidar, digital +elevation models (DEM), vector, machine learning labels, and composites like NDVI and +mosaics. STAC is intentionally designed with a minimal core and flexible extension mechanism +to support a broad set of use cases. 
This specification has matured over the past several +years, and is used in numerous production deployments. -pygeoapi implements STAC as an geospatial file browser through the FileSystem provider, +pygeoapi has two built-in providers to browse STAC catalogs: `FileSystem Provider`_ and +`Hateoas Provider`_. + +Hateoas Provider +================ + +HATEOAS (Hypermedia as the Engine of Application State) is a way of implementing a REST +application that allows the client to dynamically navigate to the appropriate resources +by browsing hypermedia links. This type of navigation is similar to web navigation +and requires a very precise data structure that must be respected to allow the HATEOAS +Provider to behave correctly. + +There are three component specifications (Catalog, Collection, Item) that together make +up the core SpatioTemporal Asset Catalog specification. An Item represents a single +spatiotemporal asset as GeoJSON. The Catalog specification provides structural elements +to group Items and Collections. Collections are catalogs that add more required metadata +and describe a group of related Items. + +The full catalog structure of links down to sub-catalogs and Items, and their links back to +their parents and roots, must be done with **relative** URLs for the HATEOAS Provider to work +correctly. The structural *rel* types include *root*, *parent*, *child*, *item*, and +*collection*. Asset links must be **absolute** URLs. Other links can be absolute, especially +if they describe a resource that makes less sense in the catalog, like derived_from or even +license (it can be nice to include the license in the catalog, but some licenses live at a +canonical online location which makes more sense to refer to directly). This enables the +full catalog (excluding the assets) to be downloaded or copied to another location and to +still be valid. This also implies no self link, as that link must be absolute. + +So, the following rules must be respected: + +1. 
Root documents (Catalogs / Collections) must be at the root of a directory tree containing the static catalog. + +2. Catalogs must be named catalog.json and Collections must be named collection.json. + +3. Sub-Catalogs or sub-Collections must be stored in subdirectories of their parent (and only 1 subdirectory deeper than a document's parent, e.g. .../sample/sub1/catalog.json). + +4. Limit the number of Items in a Catalog or Collection, grouping / partitioning as relevant to the dataset. + +5. Use structural elements (Catalog and Collection) consistently across each 'level' of your hierarchy. For example, if levels 2 and 4 of the hierarchy only contain Collections, don't add a Catalog at levels 2 and 4. + +6. Items must be named <*id*>.json. + +7. Items must be stored in subdirectories (1 level deeper) of their parent Catalog or Collection. The subdirectory must have the same name (<*id*>) as the Item without the *.json* extension. This means that each Item is contained in a unique subdirectory. + +8. The links to the actual assets must be absolute URLs. + +------------- + +File examples +------------- + +**Structure of the catalog.json file** + +.. code-block:: json + + { + "id": "STAC-Catalog", + "stac_version": "1.0.0", + "description": "A description of the STAC Catalog", + "links": [ + { + "rel": "root", + "href": "./catalog.json", + "type": "application/json" + }, + { + "rel": "child", + "href": "./eo4ce/catalog.json", + "type": "application/json" + }, + ... + { + "rel": "child", + "href": "./dem/catalog.json", + "type": "application/json" + } + ], + "stac_extensions": [], + "title": "STAC Catalog" + } + +The code above shows the root catalog. The sub-catalogs have an additional ``rel`` entry pointing to the parent. + +.. 
code-block:: json + + { + "id": "dem", + "stac_version": "1.0.0", + "description": "Digital Elevation Data", + "links": [ + { + "rel": "root", + "href": "../catalog.json", + "type": "application/json" + }, + { + "rel": "child", + "href": "./hrdsm/collection.json", + "type": "application/json" + }, + { + "rel": "parent", + "href": "../catalog.json", + "type": "application/json" + } + ], + "stac_extensions": [], + "title": "DEM" + } + +------------------------------------- + +**Structure of the collection.json file** + +Collections are similar to Catalogs with extra fields. + +.. code-block:: json + + { + "id": "hrdsm", + "stac_version": "1.0.0", + "description": "High Resolution Digital Surface Model", + "links": [ + { + "rel": "root", + "href": "../../catalog.json", + "type": "application/json" + }, + { + "rel": "item", + "href": "./arcticdem-frontiere-0/arcticdem-frontiere-0.json", + "type": "application/json" + }, + ... + { + "rel": "item", + "href": "./arcticdem-frontiere-9/arcticdem-frontiere-9.json", + "type": "application/json" + }, + { + "rel": "parent", + "href": "../catalog.json", + "type": "application/json" + } + ], + "stac_extensions": [], + "extent": { + "spatial": { + "bbox": [ + [ + -142.76516601842533, + 59.65274347822059, + -138.41658819177135, + 69.81052152420365 + ] + ] + }, + "temporal": { + "interval": [ + [ + "2014-09-03T14:00:00Z", + "2020-09-28T15:49:00.559166Z" + ] + ] + } + }, + "license": "proprietary" + } + +------------------------------------- + +**Structure of the Item .json file** + +The example below shows the content of a file named *arcticdem-frontiere-0.json*. + +.. 
code-block:: json + + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "arcticdem-frontiere-0", + "properties": { + "layer:ids": [ + "dem-hrdsm" + ], + "collection": "hrdsm", + "datetime": "2020-09-28T15:48:56.483794Z" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + -140.27389595735178, + 59.65274347822059 + ], + [ + -138.41658819177135, + 59.65274347822059 + ], + [ + -138.41658819177135, + 60.579416456816496 + ], + [ + -140.27389595735178, + 60.579416456816496 + ], + [ + -140.27389595735178, + 59.65274347822059 + ] + ] + ] + }, + "links": [ + { + "rel": "root", + "href": "../../../catalog.json", + "type": "application/json" + }, + { + "rel": "collection", + "href": "../collection.json", + "type": "application/json" + }, + { + "rel": "parent", + "href": "../collection.json", + "type": "application/json" + } + ], + "assets": { + "image": { + "href": "http://absolute/path/to/the/ressource/arcticdem-frontiere-0.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "roles": [] + } + }, + "bbox": [ + -140.27389595735178, + 59.65274347822059, + -138.41658819177135, + 60.579416456816496 + ], + "stac_extensions": [], + "collection": "hrdsm" + } + +--------------------- + +HATEOAS Configuration +--------------------- + +Configuring HATEOAS STAC Provider in pygeoapi is done by simply pointing the ``data`` provider property +to the local directory or remote URL and specifying the root file name (catalog.json or collection.json) in the file_types property: + +Connection examples +------------------- + +.. code-block:: yaml + + my-remote-stac-resource: + type: stac-collection + ... + providers: + - type: stac + name: Hateoas + data: https://datacube-dev-data-public.s3.ca-central-1.amazonaws.com/catalog/water + file_types: catalog.json + + my-local-stac-resource: + type: stac-collection + ... 
+ providers: + - type: stac + name: Hateoas + data: tests/stac + file_types: catalog.json + +------------------- + +FileSystem Provider +=================== + +The FileSystem Provider implements STAC as a geospatial file browser through the server's file system, supporting any level of file/directory nesting/hierarchy. Configuring STAC in pygeoapi is done by simply pointing the ``data`` provider property diff --git a/pygeoapi/plugin.py b/pygeoapi/plugin.py index d3e8da4..1c9dc7f 100644 --- a/pygeoapi/plugin.py +++ b/pygeoapi/plugin.py @@ -46,6 +46,7 @@ PLUGINS = { 'SQLiteGPKG': 'pygeoapi.provider.sqlite.SQLiteGPKGProvider', 'MongoDB': 'pygeoapi.provider.mongo.MongoProvider', 'FileSystem': 'pygeoapi.provider.filesystem.FileSystemProvider', + 'Hateoas': 'pygeoapi.provider.hateoas.HateoasProvider', 'rasterio': 'pygeoapi.provider.rasterio_.RasterioProvider', 'xarray': 'pygeoapi.provider.xarray_.XarrayProvider', 'MVT': 'pygeoapi.provider.mvt.MVTProvider', diff --git a/pygeoapi/provider/filesystem.py b/pygeoapi/provider/filesystem.py index e9c547a..caf4e03 100644 --- a/pygeoapi/provider/filesystem.py +++ b/pygeoapi/provider/filesystem.py @@ -168,6 +168,7 @@ class FileSystemProvider(BaseProvider): 'href': newpath, 'type': 'text/html', 'created': filectime, + 'entry:type': 'Catalog' }) elif os.path.isfile(fullpath): basename, extension = os.path.splitext(dc) @@ -180,7 +181,8 @@ class FileSystemProvider(BaseProvider): 'href': newpath, 'title': get_path_basename(newpath2), 'created': filectime, - 'file:size': filesize + 'file:size': filesize, + 'entry:type': 'Item' }) # child_links.append({ # 'rel': 'item', diff --git a/pygeoapi/provider/hateoas.py b/pygeoapi/provider/hateoas.py new file mode 100644 index 0000000..da38425 --- /dev/null +++ b/pygeoapi/provider/hateoas.py @@ -0,0 +1,205 @@ +# ================================================================= +# +# Authors: yves.choquette +# +# Copyright (c) 2022 Yves Choquette +# +# Permission is hereby granted, free of charge, 
to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
class HateoasProvider(BaseProvider):
    """
    STAC provider that browses a static catalog by following its links.

    A request is resolved against ``self.data + entrypath`` by trying, in
    order, ``catalog.json``, ``collection.json`` and ``<name>/<name>.json``
    (an Item document) in that location.
    """

    # Maps the file name at the end of a child href to the entry type
    # advertised in the generated link.
    _ENTRY_TYPES = {
        'catalog.json': 'Catalog',
        'collection.json': 'Collection',
    }

    def __init__(self, provider_def):
        """
        Initialize object

        :param provider_def: provider definition

        :returns: pygeoapi.provider.hateoas.HateoasProvider
        """

        super().__init__(provider_def)

    def get_data_path(self, baseurl, urlpath, entrypath):
        """
        Gets directory listing or file description or raw file dump

        :param baseurl: base URL of endpoint
        :param urlpath: base path of URL
        :param entrypath: basepath of the entry selected (equivalent of URL);
                          an entrypath without any ``/`` is treated as the
                          root of the catalog

        :returns: `dict` of catalogs/collections or `dict` of GeoJSON item

        :raises ProviderNotFoundError: if no catalog, collection or item
                                       document can be loaded for the path,
                                       or the path contains ``..`` segments
        """

        # entrypath is appended verbatim to the configured data location,
        # so refuse parent-directory segments that would escape it.
        if '..' in entrypath.replace('\\', '/').split('/'):
            msg = 'Resource does not exist: {}'.format(entrypath)
            LOGGER.error(msg)
            raise ProviderNotFoundError(msg)

        # os.path.join yields backslashes on Windows; URLs always use '/'.
        thispath = os.path.join(baseurl, urlpath).replace('\\', '/')
        data_path = self.data + entrypath

        child_links = []

        if '/' not in entrypath:  # root
            root_link = baseurl
        else:
            parentpath = urljoin(thispath, '.')
            child_links.append({
                'rel': 'parent',
                'href': '{}?f=json'.format(parentpath),
                'type': 'application/json'
            })
            child_links.append({
                'rel': 'parent',
                'href': parentpath,
                'type': 'text/html'
            })

            # Climb one level per '/' found in entrypath.
            root_link = urljoin(thispath, '../' * entrypath.count('/'))

        content = {
            'links': [{
                'rel': 'root',
                'href': '{}?f=json'.format(root_link),
                'type': 'application/json'
            }, {
                'rel': 'root',
                'href': root_link,
                'type': 'text/html'
            }, {
                'rel': 'self',
                'href': '{}?f=json'.format(thispath),
                'type': 'application/json',
            }, {
                'rel': 'self',
                'href': thispath,
                'type': 'text/html'
            }]
        }

        resource_type, jsondata = self._load_resource(data_path)

        if resource_type in ('Catalog', 'Collection'):
            content['type'] = resource_type
            child_links.extend(
                self._build_child_links(jsondata, baseurl, urlpath))
        else:  # 'Assets': the Item document itself becomes the response
            content = jsondata
            assets = content.setdefault('assets', {})
            assets['default'] = {'href': thispath}

            # 'properties' or 'datetime' may be absent; fall back to None
            # rather than failing the whole request.
            created = (content.get('properties') or {}).get('datetime')
            for asset in assets.values():
                asset['file:size'] = 0
                asset['created'] = created

        content.setdefault('links', []).extend(child_links)

        return content

    def _load_resource(self, data_path):
        """
        Load the JSON document describing ``data_path``

        :param data_path: data location joined with the requested entry path

        :returns: `tuple` of resource type (``Catalog``, ``Collection`` or
                  ``Assets``) and the parsed `dict`

        :raises ProviderNotFoundError: if none of the candidates can be loaded
        """

        candidates = (
            ('Catalog', '{}/catalog.json'.format(data_path)),
            ('Collection', '{}/collection.json'.format(data_path)),
            ('Assets', '{}/{}.json'.format(
                data_path, os.path.basename(data_path))),
        )

        LOGGER.debug('Checking if path exists as Catalog, Collection or Asset')
        last_error = None
        for resource_type, jsonpath in candidates:
            try:
                return resource_type, _get_json_data(jsonpath)
            except Exception as err:
                # Deliberately broad: any failure (missing file, HTTP error,
                # invalid JSON) just means "try the next candidate".
                LOGGER.debug('Unable to load %s: %s', jsonpath, err)
                last_error = err

        msg = 'Resource does not exist: {}'.format(data_path)
        LOGGER.error(msg)
        raise ProviderNotFoundError(msg) from last_error

    def _build_child_links(self, jsondata, baseurl, urlpath):
        """
        Turn the ``child``/``item`` links of a catalog into browsable links

        :param jsondata: parsed catalog or collection document
        :param baseurl: base URL of endpoint
        :param urlpath: base path of URL

        :returns: `list` of link `dict` sorted by the original href
        """

        hrefs = sorted(
            link['href'].replace('\\', '/')
            for link in jsondata.get('links', [])
            if link.get('rel') in ('child', 'item') and 'href' in link
        )

        links = []
        for href in hrefs:
            parts = href.split('/')
            if len(parts) == 3:  # './<name>/<file>.json'
                path_ending, entry_type = parts[1], parts[2]
            elif len(parts) == 2 and parts[0] not in ('', '.', '..'):
                # '<name>/<file>.json' (relative href without leading './')
                path_ending, entry_type = parts
            else:
                # Anything else does not follow the documented layout;
                # skip it instead of failing the whole listing.
                LOGGER.warning('Skipping unsupported link href: %s', href)
                continue

            newpath = os.path.join(baseurl, urlpath, path_ending).replace('\\', '/')  # noqa

            entry_kind = self._ENTRY_TYPES.get(entry_type)
            if entry_kind is not None:
                links.append({
                    'rel': 'child',
                    'href': newpath,
                    'type': 'text/html',
                    'created': "-",
                    'entry:type': entry_kind
                })
            else:
                links.append({
                    'rel': 'item',
                    'href': newpath,
                    'title': path_ending,
                    'created': "-",
                    'entry:type': 'Item'
                })

        return links

    def __repr__(self):
        return '<HateoasProvider> {}'.format(self.data)
json file that is located on the WEB + (HTTP request) or on the server file system + + :param jsonpath: path to the json file + + :returns: `dict` of JSON item + """ + + if jsonpath[0:4].upper() == 'HTTP': + jsondata = requests.get(jsonpath).json() + else: + with open(jsonpath) as fh: + jsondata = json.load(fh) + + return jsondata diff --git a/pygeoapi/templates/stac/catalog.html b/pygeoapi/templates/stac/catalog.html index 75c6a84..8fa8e03 100644 --- a/pygeoapi/templates/stac/catalog.html +++ b/pygeoapi/templates/stac/catalog.html @@ -10,10 +10,11 @@ {% block body %}