From 91051e5df07ed0c53ec7ddc71774f6bb7aa38826 Mon Sep 17 00:00:00 2001 From: Sander Schaminee Date: Wed, 15 Mar 2023 14:38:34 +0100 Subject: [PATCH] Add enclosure link prefetcher and tests (#1169) (#1173) --- docs/source/configuration.rst | 64 +++++++- pygeoapi/api.py | 23 ++- .../schemas/config/pygeoapi-config-0.x.yml | 3 + pygeoapi/util.py | 27 ++++ tests/pygeoapi-test-config-enclosure.yml | 140 ++++++++++++++++++ tests/test_api.py | 44 ++++++ tests/test_util.py | 18 +++ 7 files changed, 313 insertions(+), 6 deletions(-) create mode 100644 tests/pygeoapi-test-config-enclosure.yml diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 37b5208..93f3b84 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -179,7 +179,7 @@ default. - type: feature # underlying data geospatial type: (allowed values are: feature, coverage, record, tile, edr) default: true # optional: if not specified, the first provider definition is considered the default name: CSV - # transactions: DO NOT ACTIVATE unless you have setup access contol beyond pygeoapi + # transactions: DO NOT ACTIVATE unless you have setup access control beyond pygeoapi editable: true # optional: if backend is writable, default is false data: tests/data/obs.csv # required: the data filesystem path or URL, depending on plugin setup id_field: id # required for vector data, the field corresponding to the ID @@ -198,7 +198,7 @@ default. hello-world: # name of process type: collection # REQUIRED (collection, process, or stac-collection) processor: - name: HelloWorld # Python path of process defition + name: HelloWorld # Python path of process definition .. seealso:: @@ -207,6 +207,60 @@ default. .. seealso:: :ref:`plugins` for more information on plugins +Adding links to collections +--------------------------- + +You can add any type of link to a resource of type `collection`. 
+pygeoapi does not enforce anything here, as long as the link has a `type`, `rel`, and `href` parameter. +The `type` parameter defines the MIME type (`Content-Type`) of the linked resource. +The `rel` parameter tells something about what kind of link it is. You could set this to `license` to +add a data license link, or to `describedBy` if you wish to add a schema definition for example. + +It's also possible to add (bulk) download links to a collection. +These links should have their `rel` parameter set to `enclosure` and must have a `length` parameter +that defines the content length (byte size) of the file. +If you know the content length and it never changes, you can set this and pygeoapi will return the enclosure link(s) as-is. + +However, the downloadable resource may be subject to change (e.g. it may grow in size over time). +In that case, you can omit the `length` and pygeoapi will figure out the actual `Content-Length` header +by issuing a `HEAD` request on the given URL (`href` parameter). +Furthermore, if it notices that the defined `type` (MIME type) of the link does not match the actual +`Content-Type` in the response headers, it will automatically update the `type` accordingly. +Note that `type` is a mandatory link parameter though, so you must always set it. + +So for example, you could define a download link like so: + +.. code-block:: yaml + + links: + - type: application/octet-stream # must have some MIME type + rel: enclosure + title: download link + href: https://myserver.com/data/file.zip # URL + +And pygeoapi will turn that into: + +.. code-block:: json + + { + "links": [{ + "type": "application/zip", + "rel": "enclosure", + "title": "download link", + "href": "https://myserver.com/data/file.zip", + "length": 46435 + }] + } + +Note how the MIME type was updated to match the actual `Content-Type` and that the `length` was set +according to the `Content-Length` header. + +.. 
note:: + + If the `length` parameter is omitted and pygeoapi was not able to verify the `Content-Length` within 1 second + and/or within 1 URL redirect, the enclosure link will **not** be included in the response. + This means that if you want to be sure that the link is always included, you will have to set a `length`. + Publishing hidden resources --------------------------- @@ -396,7 +450,7 @@ one with terms defined by schema.org: linked-data: context: - schema: https://schema.org/ - stn_id: schema:identifer + stn_id: schema:identifier datetime: "@id": schema:observationDate "@type": schema:DateTime @@ -418,7 +472,7 @@ An example of a data provider that includes relationships between items is the S SensorThings API, by default, has relationships between entities within its data model. Setting the ``intralink`` field of the SensorThings provider to ``true`` sets pygeoapi to represent the relationship between configured entities as intra-pygeoapi links or URIs. -This relationship can further be maintained in the JSON-LD structured data using the appropiate +This relationship can further be maintained in the JSON-LD structured data using the appropriate ``@context`` with the sosa/ssn ontology. For example: .. code-block:: yaml @@ -446,7 +500,7 @@ This relationship can further be maintained in the JSON-LD structured data using Datastream: sosa:isMemberOf Sometimes, the JSON-LD desired for an individual feature in a collection is more complicated than can be achieved by -aliasing properties using a context. In thise case, it is possible to specify a Jinja2 template. When ``item_template`` +aliasing properties using a context. In this case, it is possible to specify a Jinja2 template. When ``item_template`` is defined for a feature collection, the json-ld prepared by pygeoapi will be used to render the Jinja2 template specified by the path. The path specified can be absolute or relative to pygeoapi's template folder. 
For even more deployment flexibility, the path can be specified with string interpolation of environment variables. diff --git a/pygeoapi/api.py b/pygeoapi/api.py index 32dad10..9fc86cd 100644 --- a/pygeoapi/api.py +++ b/pygeoapi/api.py @@ -75,7 +75,7 @@ from pygeoapi.provider.tile import (ProviderTileNotFoundError, ProviderTileQueryError, ProviderTilesetIdNotFoundError) from pygeoapi.models.cql import CQLModel -from pygeoapi.util import (dategetter, DATETIME_FORMAT, +from pygeoapi.util import (dategetter, DATETIME_FORMAT, UrlPrefetcher, filter_dict_by_key_value, get_provider_by_type, get_provider_default, get_typed_value, JobStatus, json_serial, render_j2_template, str2bool, @@ -625,6 +625,7 @@ class API: self.config = config self.config['server']['url'] = self.config['server']['url'].rstrip('/') + self.prefetcher = UrlPrefetcher() CHARSET[0] = config['server'].get('encoding', 'utf-8') if config['server'].get('gzip'): @@ -921,6 +922,7 @@ class API: if 'trs' in t_ext: collection['extent']['temporal']['trs'] = t_ext['trs'] + LOGGER.debug('Processing configured collection links') for link in l10n.translate(v['links'], request.locale): lnk = { 'type': link['type'], @@ -931,6 +933,25 @@ class API: if 'hreflang' in link: lnk['hreflang'] = l10n.translate( link['hreflang'], request.locale) + content_length = link.get('length', 0) + + if lnk['rel'] == 'enclosure' and content_length == 0: + # Issue HEAD request for enclosure links without length + lnk_headers = self.prefetcher.get_headers(lnk['href']) + content_length = int(lnk_headers.get('content-length', 0)) + content_type = lnk_headers.get('content-type', lnk['type']) + if content_length == 0: + # Skip this (broken) link + LOGGER.debug(f"Enclosure {lnk['href']} is invalid") + continue + if content_type != lnk['type']: + # Update content type if different from specified + lnk['type'] = content_type + LOGGER.debug( + f"Fixed media type for enclosure {lnk['href']}") + + if content_length > 0: + lnk['length'] = 
content_length collection['links'].append(lnk) diff --git a/pygeoapi/schemas/config/pygeoapi-config-0.x.yml b/pygeoapi/schemas/config/pygeoapi-config-0.x.yml index 23d2a13..0abcdc7 100644 --- a/pygeoapi/schemas/config/pygeoapi-config-0.x.yml +++ b/pygeoapi/schemas/config/pygeoapi-config-0.x.yml @@ -312,6 +312,9 @@ properties: hreflang: type: string description: language + length: + type: integer + description: optional content size in bytes (e.g. for download links) required: - type - rel diff --git a/pygeoapi/util.py b/pygeoapi/util.py index 2ce41c0..62eed57 100644 --- a/pygeoapi/util.py +++ b/pygeoapi/util.py @@ -50,6 +50,8 @@ import dateutil.parser from jinja2 import Environment, FileSystemLoader, select_autoescape from babel.support import Translations import yaml +from requests import Session +from requests.structures import CaseInsensitiveDict from pygeoapi import __version__ from pygeoapi import l10n @@ -537,3 +539,28 @@ def get_envelope(coords_list: List[List[float]]) -> list: bounds = polygon.bounds return [[bounds[0], bounds[3]], [bounds[2], bounds[1]]] + + +class UrlPrefetcher: + """ Prefetcher to get HTTP headers for specific URLs. + Allows a maximum of 1 redirect by default. + """ + def __init__(self): + self._session = Session() + self._session.max_redirects = 1 + + def get_headers(self, url: str, **kwargs) -> CaseInsensitiveDict: + """ Issues an HTTP HEAD request to the given URL. + Returns a case-insensitive dictionary of all headers. + If the request times out (defaults to 1 second unless `timeout` + keyword argument is set), or the response has a bad status code, + an empty dictionary is returned. 
+ """ + kwargs.setdefault('timeout', 1) + kwargs.setdefault('allow_redirects', True) + try: + response = self._session.head(url, **kwargs) + response.raise_for_status() + except Exception: # noqa + return CaseInsensitiveDict() + return response.headers diff --git a/tests/pygeoapi-test-config-enclosure.yml b/tests/pygeoapi-test-config-enclosure.yml new file mode 100644 index 0000000..e401162 --- /dev/null +++ b/tests/pygeoapi-test-config-enclosure.yml @@ -0,0 +1,140 @@ +# ================================================================= +# +# Authors: Tom Kralidis +# +# Copyright (c) 2019 Tom Kralidis +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# +# ================================================================= + +server: + bind: + host: 0.0.0.0 + port: 5000 + url: http://localhost:5000/ + mimetype: application/json; charset=UTF-8 + encoding: utf-8 + gzip: false + languages: + # First language is the default language + - en-US + - fr-CA + cors: true + pretty_print: true + limit: 10 + # templates: /path/to/templates + map: + url: https://maps.wikimedia.org/osm-intl/{z}/{x}/{y}.png + attribution: 'Wikimedia maps | Map data © OpenStreetMap contributors' + manager: + name: TinyDB + connection: /tmp/pygeoapi-test-process-manager.db + output_dir: /tmp + +logging: + level: DEBUG + #logfile: /tmp/pygeoapi.log + +metadata: + identification: + title: + en: pygeoapi default instance + fr: instance par défaut de pygeoapi + description: + en: pygeoapi provides an API to geospatial data + fr: pygeoapi fournit une API aux données géospatiales + keywords: + en: + - geospatial + - data + - api + fr: + - géospatiale + - données + - api + keywords_type: theme + terms_of_service: https://creativecommons.org/licenses/by/4.0/ + url: http://example.org + license: + name: CC-BY 4.0 license + url: https://creativecommons.org/licenses/by/4.0/ + provider: + name: Organization Name + url: https://pygeoapi.io + contact: + name: Lastname, Firstname + position: Position Title + address: Mailing Address + city: City + stateorprovince: Administrative Area + postalcode: Zip or Postal Code + country: Country + phone: +xx-xxx-xxx-xxxx + fax: +xx-xxx-xxx-xxxx + email: you@example.org + url: Contact URL + hours: Hours of Service + instructions: During hours of service. Off on weekends. + role: pointOfContact + +resources: + objects: + type: collection + title: GeoJSON objects + description: GeoJSON geometry types for GeoSparql and Schema Geometry conversion. 
+ keywords: + - shapes + links: + - type: text/html + rel: canonical + title: data source + href: https://en.wikipedia.org/wiki/GeoJSON + hreflang: en-US + - type: application/xml + rel: enclosure + title: download link 1 + href: https://github.com/geopython/pygeoapi/raw/4a18393662583e53b8c7d591130246d9cd2c3f3f/pygeoapi/static/img/pygeoapi.png + length: 10000 + - type: image/png + rel: enclosure + title: download link 2 + href: https://github.com/geopython/pygeoapi/raw/4a18393662583e53b8c7d591130246d9cd2c3f3f/pygeoapi/static/img/pygeoapi.png + - type: image/jpg + rel: enclosure + title: download link 3 + href: https://github.com/geopython/pygeoapi/raw/4a18393662583e53b8c7d591130246d9cd2c3f3f/pygeoapi/static/img/pygeoapi.png + linked-data: + item_template: tests/data/base.jsonld + extents: + spatial: + bbox: [-180,-90,180,90] + crs: http://www.opengis.net/def/crs/OGC/1.3/CRS84 + temporal: + begin: null + end: null # or empty (either means open ended) + providers: + - type: feature + name: GeoJSON + data: tests/data/items.geojson + id_field: fid + uri_field: uri diff --git a/tests/test_api.py b/tests/test_api.py index eed75c7..3974f59 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -55,6 +55,13 @@ def config(): return yaml_load(fh) +@pytest.fixture() +def config_enclosure() -> dict: + """ Returns a pygeoapi configuration with enclosure links. """ + with open(get_test_file_path('pygeoapi-test-config-enclosure.yml')) as fh: + return yaml_load(fh) + + @pytest.fixture() def config_hidden_resources(): filename = 'pygeoapi-test-config-hidden-resources.yml' @@ -73,6 +80,12 @@ def api_(config): return API(config) +@pytest.fixture() +def enclosure_api(config_enclosure): + """ Returns an API instance with a collection with enclosure links. 
""" + return API(config_enclosure) + + @pytest.fixture() def api_hidden_resources(config_hidden_resources): return API(config_hidden_resources) @@ -860,6 +873,37 @@ def test_get_collection_items(config, api_): assert code == HTTPStatus.BAD_REQUEST +def test_describe_collections_enclosures(config_enclosure, enclosure_api): + original_enclosures = { + lnk['title']: lnk + for lnk in config_enclosure['resources']['objects']['links'] + if lnk['rel'] == 'enclosure' + } + + req = mock_request() + _, _, response = enclosure_api.describe_collections(req, 'objects') + features = json.loads(response) + modified_enclosures = { + lnk['title']: lnk for lnk in features['links'] + if lnk['rel'] == 'enclosure' + } + + # If type and length is set, do not verify/update link + assert original_enclosures['download link 1'] == \ + modified_enclosures['download link 1'] + # If length is missing, modify link type and length + assert original_enclosures['download link 2']['type'] == \ + modified_enclosures['download link 2']['type'] + assert modified_enclosures['download link 2']['type'] == \ + modified_enclosures['download link 3']['type'] + assert 'length' not in original_enclosures['download link 2'] + assert modified_enclosures['download link 2']['length'] > 0 + assert modified_enclosures['download link 2']['length'] == \ + modified_enclosures['download link 3']['length'] + assert original_enclosures['download link 3']['type'] != \ + modified_enclosures['download link 3']['type'] + + def test_get_collection_items_json_ld(config, api_): req = mock_request({ 'f': 'jsonld', diff --git a/tests/test_util.py b/tests/test_util.py index ef3ed39..f40d8ab 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -156,3 +156,21 @@ def test_read_data(): data = util.read_data(get_test_file_path('pygeoapi-test-config.yml')) assert isinstance(data, bytes) + + +def test_prefetcher(): + prefetcher = util.UrlPrefetcher() + assert prefetcher.get_headers('bad_url') == {} + # URL below will redirect once 
+ url = 'https://github.com/geopython/pygeoapi/raw/4a18393662583e53b8c7d591130246d9cd2c3f3f/pygeoapi/static/img/pygeoapi.png' # noqa + headers = prefetcher.get_headers(url) + length = int(headers.get('content-length', 0)) + assert length > 0 + # Test without redirect + headers = prefetcher.get_headers(url, allow_redirects=False) + assert headers.get('content-length') in (0, '0', None) + assert headers.get('content-type') != 'image/png' + # Test using redirect location from header + headers = prefetcher.get_headers(headers['location']) + assert int(headers.get('content-length', 0)) == length + assert headers.get('content-type') == 'image/png'