add support for STAC Azure blob storage backend (#1196)

This commit is contained in:
Tom Kralidis
2023-03-30 16:49:16 -04:00
committed by GitHub
parent 0f7a8a122c
commit e8acad7120
7 changed files with 477 additions and 82 deletions
+89 -57
View File
@@ -14,8 +14,93 @@ mosaics. STAC is intentionally designed with a minimal core and flexible extensi
to support a broad set of use cases. This specification has matured over the past several
years, and is used in numerous production deployments.
pygeoapi has two built-in providers to browse STAC catalogs: `FileSystem Provider`_ and
`Hateoas Provider`_.
pygeoapi built-in providers to browse STAC catalogs are described below:
FileSystem Provider
===================
The FileSystem Provider implements STAC as a geospatial file browser through the server's file system,
supporting any level of file/directory nesting/hierarchy.
Configuring STAC in pygeoapi is done by simply pointing the ``data`` provider property
to the given directory and specifying allowed file types:
Connection examples
-------------------
.. code-block:: yaml
my-stac-resource:
type: stac-collection
...
providers:
- type: stac
name: FileSystem
data: /Users/tomkralidis/Dev/data/gdps
file_types:
- .grib2
.. note::
``rasterio`` and ``fiona`` are required for describing geospatial files.
pygeometa metadata control files
--------------------------------
pygeoapi's STAC filesystem functionality supports `pygeometa`_ MCF files residing
in the same directory as data files. If an MCF file is found, it will be used
as part of generating the STAC item metadata (e.g. a file named ``birds.csv``
having an associated ``birds.yml`` file). If no MCF file is found, then
pygeometa will generate the STAC item metadata from configuration and by
reading the data's properties.
Publishing ESRI Shapefiles
--------------------------
ESRI Shapefile publishing requires specifying all required component file extensions
(``.shp``, ``.shx``, ``.dbf``) with the provider ``file_types`` option.
Data access examples
--------------------
* STAC root page
* http://localhost:5000/stac
From here, browse the filesystem accordingly.
Azure Blob Storage Provider
===========================
The AzureBlobStorage Provider implements STAC as a geospatial file browser through Azure Blob Storage,
supporting any level of file/directory nesting/hierarchy.
Configuring STAC in pygeoapi is done by simply pointing the ``data`` provider property
to the given container and specifying allowed file types:
Connection examples
-------------------
.. code-block:: yaml
my-stac-resource:
type: stac-collection
...
providers:
- type: stac
name: AzureBlobStorage
data: my-container-name
file_types:
- .grib2
.. note::
The ``AZURE_STORAGE_CONNECTION_STRING`` environment variable is required and should be set accordingly.
.. note::
``rasterio`` and ``fiona`` are required for describing geospatial files.
Hateoas Provider
================
@@ -181,7 +266,7 @@ Collections are similar to Catalogs with extra fields.
"license": "proprietary"
}
-------------------------------------
**Structure of the Item <id>.json file**
@@ -261,7 +346,7 @@ The example below shows the content of a file named *arcticdem-frontiere-0.json*
"collection": "hrdsm"
}
---------------------
HATEOAS Configuration
---------------------
@@ -292,59 +377,6 @@ Connection examples
data: tests/stac
file_types: catalog.json
-------------------
FileSystem Provider
===================
The FileSystem Provider implements STAC as a geospatial file browser through the server's file system,
supporting any level of file/directory nesting/hierarchy.
Configuring STAC in pygeoapi is done by simply pointing the ``data`` provider property
to the given directory and specifying allowed file types:
Connection examples
-------------------
.. code-block:: yaml
my-stac-resource:
type: stac-collection
...
providers:
- type: stac
name: FileSystem
data: /Users/tomkralidis/Dev/data/gdps
file_types:
- .grib2
.. note::
``rasterio`` and ``fiona`` are required for describing geospatial files.
pygeometa metadata control files
--------------------------------
pygeoapi's STAC filesystem functionality supports `pygeometa`_ MCF files residing
in the same directory as data files. If an MCF file is found, it will be used
as part of generating the STAC item metadata (e.g. a file named ``birds.csv``
having an associated ``birds.yml`` file). If no MCF file is found, then
pygeometa will generate the STAC item metadata from configuration and by
reading the data's properties.
Publishing ESRI Shapefiles
--------------------------
ESRI Shapefile publishing requires specifying all required component file extensions
(``.shp``, ``.shx``, ``.dbf``) with the provider ``file_types`` option.
Data access examples
--------------------
* STAC root page
* http://localhost:5000/stac
From here, browse the filesystem accordingly.
.. _`SpatioTemporal Asset Catalog (STAC)`: https://stacspec.org
.. _`pygeometa`: https://geopython.github.io/pygeometa
+11 -10
View File
@@ -2,7 +2,7 @@
#
# Authors: Tom Kralidis <tomkralidis@gmail.com>
#
# Copyright (c) 2022 Tom Kralidis
# Copyright (c) 2023 Tom Kralidis
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
@@ -38,25 +38,26 @@ LOGGER = logging.getLogger(__name__)
#: formatters and processes available
PLUGINS = {
'provider': {
'AzureBlobStorage': 'pygeoapi.provider.azure_.AzureBlobStorageProvider', # noqa
'CSV': 'pygeoapi.provider.csv_.CSVProvider',
'Elasticsearch': 'pygeoapi.provider.elasticsearch_.ElasticsearchProvider', # noqa
'ElasticsearchCatalogue': 'pygeoapi.provider.elasticsearch_.ElasticsearchCatalogueProvider', # noqa
'ESRI': 'pygeoapi.provider.esri.ESRIServiceProvider',
'FileSystem': 'pygeoapi.provider.filesystem.FileSystemProvider',
'GeoJSON': 'pygeoapi.provider.geojson.GeoJSONProvider',
'Hateoas': 'pygeoapi.provider.hateoas.HateoasProvider',
'MapScript': 'pygeoapi.provider.mapscript_.MapScriptProvider',
'MongoDB': 'pygeoapi.provider.mongo.MongoProvider',
'MVT': 'pygeoapi.provider.mvt.MVTProvider',
'OGR': 'pygeoapi.provider.ogr.OGRProvider',
'PostgreSQL': 'pygeoapi.provider.postgresql.PostgreSQLProvider',
'SQLiteGPKG': 'pygeoapi.provider.sqlite.SQLiteGPKGProvider',
'MongoDB': 'pygeoapi.provider.mongo.MongoProvider',
'FileSystem': 'pygeoapi.provider.filesystem.FileSystemProvider',
'Hateoas': 'pygeoapi.provider.hateoas.HateoasProvider',
'rasterio': 'pygeoapi.provider.rasterio_.RasterioProvider',
'xarray': 'pygeoapi.provider.xarray_.XarrayProvider',
'MapScript': 'pygeoapi.provider.mapscript_.MapScriptProvider',
'WMSFacade': 'pygeoapi.provider.wms_facade.WMSFacadeProvider',
'MVT': 'pygeoapi.provider.mvt.MVTProvider',
'TinyDBCatalogue': 'pygeoapi.provider.tinydb_.TinyDBCatalogueProvider',
'SensorThings': 'pygeoapi.provider.sensorthings.SensorThingsProvider',
'SQLiteGPKG': 'pygeoapi.provider.sqlite.SQLiteGPKGProvider',
'Socrata': 'pygeoapi.provider.socrata.SODAServiceProvider',
'TinyDBCatalogue': 'pygeoapi.provider.tinydb_.TinyDBCatalogueProvider',
'WMSFacade': 'pygeoapi.provider.wms_facade.WMSFacadeProvider',
'xarray': 'pygeoapi.provider.xarray_.XarrayProvider',
'xarray-edr': 'pygeoapi.provider.xarray_edr.XarrayEDRProvider'
},
'formatter': {
+361
View File
@@ -0,0 +1,361 @@
# =================================================================
#
# Authors: Tom Kralidis <tomkralidis@gmail.com>
#
# Copyright (c) 2023 Tom Kralidis
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# =================================================================
from datetime import datetime
import logging
import os
from azure.storage.blob import BlobServiceClient
from pygeoapi.provider.base import (BaseProvider, ProviderConnectionError,
ProviderNotFoundError)
from pygeoapi.util import file_modified_iso8601, get_path_basename, url_join
LOGGER = logging.getLogger(__name__)
class AzureBlobStorageProvider(BaseProvider):
    """
    Azure blob storage Provider

    Browses an Azure Blob Storage container as a STAC catalog. The
    container name is read from ``self.data`` and the connection string
    from the ``AZURE_STORAGE_CONNECTION_STRING`` environment variable.
    """

    def __init__(self, provider_def):
        """
        Initialize object

        :param provider_def: provider definition

        :raises ProviderConnectionError: if the
                                         AZURE_STORAGE_CONNECTION_STRING
                                         environment variable is not set

        :returns: pygeoapi.provider.azure_.AzureBlobStorageProvider
        """

        super().__init__(provider_def)

        connection_string = os.environ.get('AZURE_STORAGE_CONNECTION_STRING')
        if connection_string is None:
            msg = 'AZURE_STORAGE_CONNECTION_STRING not set!'
            LOGGER.error(msg)
            # carry the message on the exception so callers can report it
            raise ProviderConnectionError(msg)

        self.blob_service_client = BlobServiceClient.from_connection_string(
            connection_string)
        self.container_client = self.blob_service_client.get_container_client(
            self.data)

    def get_data_path(self, baseurl, urlpath, dirpath):
        """
        Gets directory listing or file description or raw file dump

        :param baseurl: base URL of endpoint
        :param urlpath: base path of URL
        :param dirpath: directory basepath (equivalent of URL)

        :raises ProviderNotFoundError: if the path cannot be resolved to a
                                       raw file, directory or file

        :returns: `dict` of file listing or `dict` of GeoJSON item or raw file
        """

        # only the first segment of the URL path is used to build links
        urlpath = urlpath.split('/')[0]
        thispath = os.path.join(baseurl, urlpath)

        LOGGER.debug(f'basepath: {baseurl}')
        LOGGER.debug(f'urlpath: {urlpath}')
        LOGGER.debug(f'path: {thispath}')

        resource_type = None
        root_link = None
        child_links = []

        data_path = self.data + dirpath

        if '/' not in dirpath:  # root
            root_link = baseurl
        else:
            parentpath = url_join(thispath, '.')
            child_links.append({
                'rel': 'parent',
                'href': f'{parentpath}?f=json',
                'type': 'application/json'
            })
            child_links.append({
                'rel': 'parent',
                'href': parentpath,
                'type': 'text/html'
            })

            # climb one '../' per path separator to get back to the root
            depth = dirpath.count('/')
            root_path = '/'.replace('/', '../' * depth, 1)
            root_link = url_join(thispath, root_path)

        content = {
            'links': [{
                'rel': 'root',
                'href': f'{root_link}?f=json',
                'type': 'application/json'
                }, {
                'rel': 'root',
                'href': root_link,
                'type': 'text/html'
                }, {
                'rel': 'self',
                'href': f'{thispath}?f=json',
                'type': 'application/json',
                }, {
                'rel': 'self',
                'href': thispath,
                'type': 'text/html'
                }
            ]
        }

        LOGGER.debug(f'data path: {data_path}')
        # blob names are relative to the container: drop the container
        # name and any leading slash
        data_path = data_path.replace(self.data, '').lstrip('/')
        LOGGER.debug(f'data path: {data_path}')

        if data_path == '':
            LOGGER.debug('Root of container')

        # NOTE(review): blob_client is not read anywhere else in this class
        self.blob_client = self.blob_service_client.get_blob_client(
            container=self.data, blob=data_path+'/')

        LOGGER.debug('Checking if path exists as raw file or directory')
        if data_path.endswith(tuple(self.file_types)):
            resource_type = 'raw_file'
        # NOTE(review): walk_blobs() returns a pager object, which is
        # presumably always truthy, so this branch likely matches every
        # path that is not a raw file - confirm against the Azure SDK
        elif self.container_client.walk_blobs(name_starts_with=data_path, prefix='/') or data_path == '':  # noqa
            resource_type = 'directory'

        LOGGER.debug('Checking if path exists as file via file_types')
        for ft in self.file_types:
            tmp_path = f'{data_path}{ft}'
            blob_tmp_path = self.blob_service_client.get_blob_client(
                container=self.data.lstrip('/'), blob=tmp_path)
            if blob_tmp_path.exists():
                # an extension-less URL that maps onto a stored blob is
                # described as a STAC item, overriding 'directory'
                resource_type = 'file'
                data_path = tmp_path
                break

        LOGGER.debug(f'Resource type: {resource_type}')

        if resource_type is None:
            msg = f'Resource does not exist: {data_path}'
            LOGGER.error(msg)
            raise ProviderNotFoundError(msg)

        if resource_type == 'raw_file':
            data = self.blob_service_client.get_blob_client(
                container=self.data.lstrip('/'), blob=data_path)
            return data.download_blob().read()

        elif resource_type == 'directory':
            content['type'] = 'Catalog'
            LOGGER.debug(f'DATA PATH: {data_path}')
            for dc in self.container_client.walk_blobs(
                    name_starts_with=data_path, prefix='/'):
                fullpath = dc.name
                LOGGER.debug(f'FULLPATH: {fullpath}')
                if fullpath.endswith('/'):
                    # names ending in '/' are treated as sub-catalogs
                    newpath = os.path.join(baseurl, urlpath, str(dc.name))
                    child_links.append({
                        'rel': 'child',
                        'href': newpath,
                        'type': 'text/html',
                        'entry:type': 'Catalog'
                    })
                else:
                    basename, extension = os.path.splitext(dc.name)
                    newpath = os.path.join(baseurl, urlpath, basename)
                    newpath2 = f'{newpath}{extension}'
                    # only blobs with a configured extension become items
                    if extension in self.file_types:
                        child_links.append({
                            'rel': 'item',
                            'href': newpath,
                            'title': get_path_basename(newpath2),
                            'created': dc.creation_time,
                            'file:size': dc.size,
                            'entry:type': 'Item'
                        })

        elif resource_type == 'file':
            # data_path was set to the matching blob name by the
            # file_types loop above
            blob_tmp_path = self.blob_service_client.get_blob_client(
                container=self.data.lstrip('/'), blob=data_path)
            blob_properties = blob_tmp_path.get_blob_properties()

            filename = os.path.basename(data_path)
            id_ = os.path.splitext(filename)[0]
            url = f'{baseurl}/{urlpath}/{data_path}'

            filectime = blob_properties.creation_time
            filesize = blob_properties.size

            content = {
                'id': id_,
                'type': 'Feature',
                'properties': {},
                'links': [],
                'assets': {}
            }

            content.update(_describe_file(blob_tmp_path.download_blob()))

            content['assets']['default'] = {
                'href': url,
                'created': filectime,
                'file:size': filesize
            }

        content['links'].extend(child_links)

        return content

    def __repr__(self):
        return f'<AzureBlobStorageProvider> {self.data}'
def _describe_file(filepath):
    """
    Helper function to describe geospatial data
    Parse file using rasterio/fiona to retrieve properties

    Raster detection is attempted first; vector detection is attempted
    only if rasterio cannot open the data. If rasterio or fiona is not
    installed, the empty description is returned.

    :param filepath: path to file, or the object returned by
                     ``download_blob()`` as passed by the provider in
                     this module

    :returns: `dict` of GeoJSON item
    """

    # function-scope import: only needed for the GRIB reference time below
    from datetime import timezone

    content = {
        'bbox': None,
        'geometry': None,
        'properties': {}
    }

    try:
        import rasterio
        from rasterio.crs import CRS
        from rasterio.io import MemoryFile
        from rasterio.warp import transform_bounds
    except ImportError as err:
        LOGGER.warning('rasterio not found')
        LOGGER.warning(err)
        return content

    try:
        import fiona
    except ImportError as err:
        LOGGER.warning('fiona not found')
        LOGGER.warning(err)
        return content

    try:  # raster
        LOGGER.debug('Testing raster data detection')
        with MemoryFile(filepath) as memfile:
            with memfile.open() as d:
                scrs = CRS(d.crs)
                if scrs.to_epsg() not in [None, 4326]:
                    # reproject the bounds to EPSG:4326 and keep the
                    # source EPSG code as a property
                    tcrs = CRS.from_epsg(4326)
                    bnds = transform_bounds(scrs, tcrs,
                                            d.bounds[0], d.bounds[1],
                                            d.bounds[2], d.bounds[3])
                    content['properties']['projection'] = scrs.to_epsg()
                else:
                    bnds = [d.bounds.left, d.bounds.bottom,
                            d.bounds.right, d.bounds.top]
                content['bbox'] = bnds
                content['geometry'] = {
                    'type': 'Polygon',
                    'coordinates': [[
                        [bnds[0], bnds[1]],
                        [bnds[0], bnds[3]],
                        [bnds[2], bnds[3]],
                        [bnds[2], bnds[1]],
                        [bnds[0], bnds[1]]
                    ]]
                }
                for k, v in d.tags(d.count).items():
                    content['properties'][k] = v
                    if k == 'GRIB_REF_TIME':
                        value = int(v.split()[0])
                        # convert in UTC so the trailing 'Z' is accurate
                        # whatever the server's local timezone is
                        datetime_ = datetime.fromtimestamp(
                            value, tz=timezone.utc).replace(tzinfo=None)
                        content['properties']['datetime'] = datetime_.isoformat() + 'Z'  # noqa
    except rasterio.errors.RasterioIOError as err:
        LOGGER.debug(err)
        try:
            LOGGER.debug('Testing vector data detection')
            # context manager ensures the fiona collection is closed
            with fiona.open(filepath) as d:
                scrs = CRS(d.crs)
                if scrs.to_epsg() not in [None, 4326]:
                    tcrs = CRS.from_epsg(4326)
                    bnds = transform_bounds(scrs, tcrs,
                                            d.bounds[0], d.bounds[1],
                                            d.bounds[2], d.bounds[3])
                    content['properties']['projection'] = scrs.to_epsg()
                else:
                    bnds = d.bounds

                # no bbox/geometry when the schema has no geometry
                if d.schema['geometry'] not in [None, 'None']:
                    content['bbox'] = [
                        bnds[0],
                        bnds[1],
                        bnds[2],
                        bnds[3]
                    ]
                    content['geometry'] = {
                        'type': 'Polygon',
                        'coordinates': [[
                            [bnds[0], bnds[1]],
                            [bnds[0], bnds[3]],
                            [bnds[2], bnds[3]],
                            [bnds[2], bnds[1]],
                            [bnds[0], bnds[1]]
                        ]]
                    }

                for k, v in d.schema['properties'].items():
                    content['properties'][k] = v

                if d.driver == 'ESRI Shapefile':
                    # NOTE(review): this branch uses os.path calls, so it
                    # assumes filepath is a local path - confirm for
                    # callers that pass a download stream
                    id_ = os.path.splitext(os.path.basename(filepath))[0]
                    content['assets'] = {}
                    for suffix in ['shx', 'dbf', 'prj']:
                        fullpath = f'{os.path.splitext(filepath)[0]}.{suffix}'
                        if os.path.exists(fullpath):
                            filectime = file_modified_iso8601(fullpath)
                            filesize = os.path.getsize(fullpath)
                            content['assets'][suffix] = {
                                'href': f'./{id_}.{suffix}',
                                'created': filectime,
                                'file:size': filesize
                            }
        except fiona.errors.DriverError:
            LOGGER.debug('Could not detect raster or vector data')

    return content
+4 -5
View File
@@ -2,7 +2,7 @@
#
# Authors: Tom Kralidis <tomkralidis@gmail.com>
#
# Copyright (c) 2022 Tom Kralidis
# Copyright (c) 2023 Tom Kralidis
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
@@ -32,11 +32,10 @@ import io
from json import loads
import logging
import os
from urllib.parse import urljoin
from pygeoapi.provider.base import (BaseProvider, ProviderConnectionError,
ProviderNotFoundError)
from pygeoapi.util import file_modified_iso8601, get_path_basename
from pygeoapi.util import file_modified_iso8601, get_path_basename, url_join
LOGGER = logging.getLogger(__name__)
@@ -83,7 +82,7 @@ class FileSystemProvider(BaseProvider):
if '/' not in dirpath: # root
root_link = baseurl
else:
parentpath = urljoin(thispath, '.')
parentpath = url_join(thispath, '.')
child_links.append({
'rel': 'parent',
'href': f'{parentpath}?f=json',
@@ -97,7 +96,7 @@ class FileSystemProvider(BaseProvider):
depth = dirpath.count('/')
root_path = '/'.replace('/', '../' * depth, 1)
root_link = urljoin(thispath, root_path)
root_link = url_join(thispath, root_path)
content = {
'links': [{
+4 -3
View File
@@ -3,6 +3,7 @@
# Authors: yves.choquette <yves.choquette@NRCan-RNCan.gc.ca>
#
# Copyright (c) 2022 Yves Choquette
# Copyright (c) 2023 Tom Kralidis
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
@@ -30,10 +31,10 @@
import requests
import logging
import os
from urllib.parse import urljoin
import json
from pygeoapi.provider.base import (BaseProvider, ProviderNotFoundError)
from pygeoapi.util import url_join
LOGGER = logging.getLogger(__name__)
@@ -74,7 +75,7 @@ class HateoasProvider(BaseProvider):
if '/' not in entrypath: # root
root_link = baseurl
else:
parentpath = urljoin(thispath, '.')
parentpath = url_join(thispath, '.')
child_links.append({
'rel': 'parent',
'href': f'{parentpath}?f=json',
@@ -88,7 +89,7 @@ class HateoasProvider(BaseProvider):
depth = entrypath.count('/')
root_path = '/'.replace('/', '../' * depth, 1)
root_link = urljoin(thispath, root_path)
root_link = url_join(thispath, root_path)
content = {
'links': [{
+6 -7
View File
@@ -4,7 +4,7 @@
# Authors: Tom Kralidis <tomkralidis@gmail.com>
#
# Copyright (c) 2020 Francesco Bartoli
# Copyright (c) 2022 Tom Kralidis
# Copyright (c) 2023 Tom Kralidis
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
@@ -33,9 +33,8 @@ import json
import logging
import requests
from pathlib import Path
from urllib.parse import urlparse, urljoin
from urllib.parse import urlparse
from pygeoapi.util import is_url, url_join
from pygeoapi.provider.tile import (
BaseTileProvider, ProviderTileNotFoundError)
from pygeoapi.provider.base import ProviderConnectionError
@@ -43,6 +42,7 @@ from pygeoapi.models.provider.base import (
TileMatrixSetEnum, TilesMetadataFormat, TileSetMetadata, LinkType,
GeospatialDataType)
from pygeoapi.models.provider.mvt import MVTTilesJson
from pygeoapi.util import is_url, url_join
LOGGER = logging.getLogger(__name__)
@@ -74,7 +74,7 @@ class MVTProvider(BaseTileProvider):
self._service_url = url_join(baseurl, servicepath)
self._service_metadata_url = urljoin(
self._service_metadata_url = url_join(
self.service_url.split('{tileMatrix}/{tileRow}/{tileCol}')[0],
'metadata')
else:
@@ -163,8 +163,7 @@ class MVTProvider(BaseTileProvider):
self._service_url = url_join(baseurl, servicepath)
tile_matrix_set = self.service_url.split(
'/{tileMatrix}/{tileRow}/{tileCol}')[0]
self._service_metadata_url = urljoin(
tile_matrix_set, 'metadata')
self._service_metadata_url = url_join(tile_matrix_set, 'metadata')
links = {
'links': [
{
@@ -270,7 +269,7 @@ class MVTProvider(BaseTileProvider):
with open(self.service_metadata_url, 'r') as md_file:
metadata_json_content = json.loads(md_file.read())
service_url = urljoin(
service_url = url_join(
server_url,
f'collections/{dataset}/tiles/{tileset}/{{tileMatrix}}/{{tileRow}}/{{tileCol}}?f=mvt') # noqa
+2
View File
@@ -1,3 +1,5 @@
azure-identity
azure-storage-blob
elasticsearch
elasticsearch-dsl
fiona