# Source code for uk_covid19.api_interface

#!/usr/bin/env python3

# Imports
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Python:
from typing import Iterable, Dict, Union, Iterator
from json import dumps
from http import HTTPStatus
from datetime import datetime
from xml.etree.ElementTree import Element as XMLElement

# 3rd party:
from requests import request, Response
import certifi

# Internal:
from uk_covid19.utils import save_data
from uk_covid19.data_format import DataFormat
from uk_covid19.exceptions import FailedRequestError

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Names exported by a star-import of this module.
__all__ = ["Cov19API"]


# Type of the ``structure`` argument of ``Cov19API``: a mapping whose keys
# are strings and whose values are strings or dictionaries.
StructureType = Dict[str, Union[dict, str]]
# Type of the ``filters`` argument of ``Cov19API``: an iterable of strings,
# which ``Cov19API.api_params`` joins with ``;``.
FiltersType = Iterable[str]


class Cov19API:
    """
    Interface to access the API service for COVID-19 data in the United Kingdom.

    Parameters
    ----------
    filters: Iterable[str]
        API filters. See the API documentation for additional
        information.

    structure: Dict[str, Union[dict, str]]
        Structure parameter. See the API documentation for
        additional information.

    latest_by: Union[str, None]
        Retrieves the latest value for a specific metric. [Default: ``None``]
    """
    # URL queried by ``head()``, ``options()`` and ``_get()``.
    endpoint = "https://api.coronavirus.data.gov.uk/v1/data"
    # URL queried by ``get_release_timestamp()``.
    release_timestamp_endpoint = "https://api.coronavirus.data.gov.uk/v1/timestamp"

    # Raw ``Last-Modified`` header value; filled in by ``_get()`` or, on
    # demand, by the ``last_update`` property.
    _last_update: Union[str, None] = None
    # Number of pages retrieved; set by ``_get()`` once pagination ends.
    _total_pages: Union[int, None] = None

    def __init__(self, filters: FiltersType, structure: StructureType,
                 latest_by: Union[str, None] = None):
        self.filters = filters

        if any(isinstance(value, (list, dict)) for value in structure):
            raise TypeError(
                "Nested structures are no longer supported. Please define a flat "
                "structure instead."
            )

        self.structure = structure
        self.latest_by = latest_by

    @property
    def total_pages(self) -> Union[int, None]:
        """
        :property:
            Produces the total number of pages for a given set of
            parameters (only after the data are requested).

        Returns
        -------
        Union[int, None]
        """
        return self._total_pages

    @property
    def last_update(self) -> str:
        """
        :property:
            Produces the timestamp for the last update in GMT.

        This property supplies the API time - i.e. the time at which the data were
        deployed to the database. Please note that there will always be a difference
        between this time and the timestamp that is displayed on the website, which may
        be accessed via the ``.get_release_timestamp()`` method. The website timestamp
        signifies the time at which the data were release to the API, and by extension
        the website.

        .. note::

            The output is extracted from the header and is accurate to
            the second.
            
        .. warning::

            The ISO-8601 standard requires a ``"Z"`` character to be added
            to the end of the timestamp. This is a timezone feature and is
            not recognised by Python's ``datetime`` library. It is, however,
            most other libraries; e.g. ``pandas``. If you wish to parse the
            timestamp using the the ``datetime`` library, make sure that you
            remove the trailing ``"Z"`` character.

        Returns
        -------
        str
            Timestamp, formatted as ISO-8601.

        Examples
        --------
        >>> filters = ["areaType=region"]
        >>> structure = {
        ...     "name": "areaName",
        ...     "newCases": "newCasesBySpecimenDate"
        ... }
        >>> data = Cov19API(
        ...     filters=filters,
        ...     structure=structure,
        ...     latest_by='newCasesBySpecimenDate'
        ... )
        >>> timestamp = data.last_update
        >>> print(timestamp)
        2020-07-27T20:29:16.000000Z

        >>> from datetime import datetime
        >>> parsed_timestamp = datetime.fromisoformat(timestamp.strip("Z"))
        >>> print(parsed_timestamp)
        2020-07-27 20:29:16
        """
        if self._last_update is None:
            self._last_update = self.head()['Last-Modified']

        timestamp = datetime.strptime(self._last_update, "%a, %d %b %Y %H:%M:%S GMT")

        return timestamp.isoformat() + ".000000Z"

[docs]    @staticmethod
    def get_release_timestamp() -> str:
        """
        :staticmethod:
            Produces the website timestamp in GMT.

        .. versionadded:: 1.2.0

        This property supplies the website timestamp - i.e. the time at which the data
        were released to the API and by extension the website. Please note that there
        will be a difference between this timestamp and the timestamp produced using
        the ``last_update`` property. The latter signifies the time at which the data
        were deployed to the database, not the time at which they were released.

        .. note::

            The output is extracted from the header and is accurate to
            the miliseconds.

        .. warning::

            The ISO-8601 standard requires a ``"Z"`` character to be added
            to the end of the timestamp. This is a timezone feature and is
            not recognised by Python's ``datetime`` library. It is, however,
            most other libraries; e.g. ``pandas``. If you wish to parse the
            timestamp using the the ``datetime`` library, make sure that you
            remove the trailing ``"Z"`` character.
            
        Returns
        -------
        str
            Timestamp, formatted as ISO-8601.

        Examples
        --------
        >>> release_timestamp = Cov19API.get_release_timestamp()
        >>> print(release_timestamp)
        2020-08-08T15:00:09.977840Z

        >>> from datetime import datetime
        >>> release_timestamp = Cov19API.get_release_timestamp()
        >>> parsed_timestamp = datetime.fromisoformat(release_timestamp.strip("Z"))
        >>> print(parsed_timestamp)
        2020-08-08 15:00:09
        """
        with request("GET", Cov19API.release_timestamp_endpoint) as response:
            json_data = response.json()

        return json_data['websiteTimestamp']

    @property
    def api_params(self) -> dict:
        """
        :staticmethod:
            API parameters, constructed based on ``filters``, ``structure``,
            and ``latest_by`` arguments as defined by the user.

        Returns
        -------
        Dict[str, str]
        """
        api_params = {
            "filters": str.join(";", self.filters),
            "structure": dumps(self.structure, separators=(",", ":")),
        }

        if self.latest_by is not None:
            api_params.update({
                "latestBy": self.latest_by
            })

        return api_params

[docs]    def head(self):
        """
        Request header for the given input arguments (``filters``,
        ``structure``, and ``lastest_by``).

        Returns
        -------
        Dict[str, str]

        Examples
        --------
        >>> filters = ["areaType=region"]
        >>> structure = {
        ...     "name": "areaName",
        ...     "newCases": "newCasesBySpecimenDate"
        ... }
        >>> data = Cov19API(
        ...     filters=filters,
        ...     structure=structure,
        ...     latest_by='newCasesBySpecimenDate'
        ... )
        >>> head = data.head()
        >>> print(head)
        {'Cache-Control': 'public, max-age=60', 'Content-Length': '0',
         ...
        }
        """
        api_params = self.api_params

        with request("HEAD", self.endpoint, params=api_params,
                     verify=certifi.where()) as response:
            response.raise_for_status()
            return response.headers

[docs]    @staticmethod
    def options():
        """
        :staticmethod:
            Provides the options by calling the ``OPTIONS`` method of the API.

        Returns
        -------
        dict
            API options.

        Examples
        --------
        >>> from pprint import pprint
        >>> options = Cov19API.options()
        >>> pprint(options)
        {'info': {'description': "Public Health England's Coronavirus Dashboard API",
         'title': 'Dashboard API',
         'version': '1.0'},
         'openapi': '3.0.1',
          ...
        }
        """
        with request("OPTIONS", Cov19API.endpoint, verify=certifi.where()) as response:
            response.raise_for_status()
            return response.json()

    def _get(self, format_as: DataFormat) -> Iterator[Response]:
        """
        Requests the data page by page and yields each response.

        When ``latest_by`` is set, a single request without a ``page``
        parameter is made and its response is yielded. Otherwise pages are
        requested from 1 upwards until a ``204 No Content`` response is
        received, at which point the number of pages is recorded. The
        ``Last-Modified`` header of every yielded page is cached for the
        ``last_update`` property.

        Parameters
        ----------
        format_as: DataFormat
            Response format.

        Returns
        -------
        Iterator[Response]

        Raises
        ------
        FailedRequestError
            When the request fails.
        """
        query = self.api_params
        query["format"] = format_as.value

        # Pagination only applies when no ``latest_by`` metric is requested.
        paginated = self.latest_by is None
        if paginated:
            query["page"] = 1

        while True:
            with request("GET", self.endpoint, params=query,
                         verify=certifi.where()) as page:
                if page.status_code >= HTTPStatus.BAD_REQUEST:
                    raise FailedRequestError(response=page, params=query)

                if not paginated:
                    yield page
                    break

                if page.status_code == HTTPStatus.NO_CONTENT:
                    # The page just requested is empty, so the previous one
                    # was the last page holding data.
                    self._total_pages = query["page"] - 1
                    break

                self._last_update = page.headers["Last-Modified"]
                yield page

            query["page"] += 1

[docs]    def get_json(self, save_as: Union[str, None] = None,
                 as_string: bool = False) -> Union[dict, str]:
        """
        Provides full data (all pages) in JSON.

        Parameters
        ----------
        save_as: Union[str, None]
            If defined, the results will (also) be saved as a
            file. [Default: ``None``]

            The value must be a path to a file with the correct
            extension -- i.e. ``.json`` for JSON).

        as_string: bool
            .. versionadded:: 1.1.4

            If ``False`` (default), returns the data as a dictionary.
            Otherwise, returns the data as a JSON string.

        Returns
        -------
        Union[Dict, str]

        Examples
        --------
        >>> filters = ["areaType=region"]
        >>> structure = {
        ...     "name": "areaName",
        ...     "newCases": "newCasesBySpecimenDate"
        ... }
        >>> data = Cov19API(
        ...     filters=filters,
        ...     structure=structure,
        ...     latest_by='newCasesBySpecimenDate'
        ... )
        >>> result = data.get_json()
        >>> print(result)
        {'data': [{'name': 'East Midlands', 'newCases': 0}, ... }
        """
        resp = {
            "data": list()
        }

        for response in self._get(DataFormat.JSON):
            current_data = response.json()
            page_data = current_data['data']

            resp["data"].extend(page_data)

        resp["lastUpdate"] = self.last_update
        resp["length"] = len(resp["data"])
        resp["totalPages"] = self._total_pages

        if as_string:
            return dumps(resp, separators=(",", ":"))

        if save_as is None:
            return resp

        data = dumps(resp, separators=(",", ":"))
        save_data(data, save_as, DataFormat.JSON)

        return resp

[docs]    def get_xml(self, save_as=None, as_string=False) -> XMLElement:
        """
        Provides full data (all pages) in XML.

        Parameters
        ----------
        save_as: Union[str, None]
            If defined, the results will (also) be saved as a
            file. [Default: ``None``]

            The value must be a path to a file with the correct
            extension -- i.e. ``.xml`` for XML).

        as_string: bool
            .. versionadded:: 1.1.4

            If ``False`` (default), returns an ``ElementTree``
            object. Otherwise, returns the data as an XML string.

        Returns
        -------
        xml.etree.ElementTree.Element

        Examples
        --------
        >>> from xml.etree.ElementTree import tostring
        >>> filters = ["areaType=region"]
        >>> structure = {
        ...     "name": "areaName",
        ...     "newCases": "newCasesBySpecimenDate"
        ... }
        >>> data = Cov19API(
        ...     filters=filters,
        ...     structure=structure,
        ...     latest_by='newCasesBySpecimenDate'
        ... )
        >>> result_xml = data.get_xml()
        >>> result_str = tostring(result_xml, encoding='unicode', method='xml')
        >>> print(result_str)
        <document>
            <data>
                <name>East Midlands</name>
                <newCases>0</newCases>
            </data>
            ...
        </document>
        """
        from xml.etree.ElementTree import SubElement, fromstring

        resp = XMLElement("document")

        for response in self._get(DataFormat.XML):
            decoded_content = response.content.decode()

            # Parsing the XML:
            parsed_data = fromstring(decoded_content)

            # Extracting "data" elements from the tree:
            page_data = parsed_data.findall(".//data")

            resp.extend(page_data)

        extras = {
            "lastUpdate": self.last_update,
            "length": len(resp.findall(".//data")),
            "totalPages": self._total_pages
        }

        for elm_name, value in extras.items():
            elm = SubElement(resp, elm_name)
            elm.text = str(value)

        if save_as is None and not as_string:
            return resp

        from xml.etree.ElementTree import tostring

        str_data = tostring(resp, encoding='unicode', method='xml')

        if as_string:
            return str_data

        save_data(str_data, save_as, DataFormat.XML)

        return resp

[docs]    def get_csv(self, save_as=None) -> str:
        """
        Provides full data (all pages) in CSV.

        .. warning::

            Please make sure that the ``structure`` is not hierarchical as
            CSV outputs are defined as 2D tables and as such, do not support
            hierarchies.

        Parameters
        ----------
        save_as: Union[str, None]
            If defined, the results will (also) be saved as a
            file. [Default: ``None``]

            The value must be a path to a file with the correct
            extension -- i.e. ``.csv`` for CSV).

        Returns
        -------
        str

        Raises
        ------
        ValueError
            If the structure is nested.

        Examples
        --------
        >>> filters = ["areaType=region"]
        >>> structure = {
        ...     "name": "areaName",
        ...     "newCases": "newCasesBySpecimenDate"
        ... }
        >>> data = Cov19API(
        ...     filters=filters,
        ...     structure=structure,
        ...     latest_by='newCasesBySpecimenDate'
        ... )
        >>> result = data.get_csv()
        >>> print(result)
        name,newCases
        East Midlands,0
        ...
        """
        # Checks to ensure that the structure is
        # not hierarchical.
        if isinstance(self.structure, dict):
            non_str = filter(
                lambda val: not isinstance(val, str),
                self.structure.values()
            )

            if list(non_str):
                struct = dumps(self.structure, indent=4)
                raise ValueError("CSV structure cannot be nested. Received:\n%s" % struct)

        linebreak = "\n"
        resp = str()

        for page_num, response in enumerate(self._get(DataFormat.CSV), start=1):
            decoded_content = response.content.decode()

            # Removing CSV header (column names) where page
            # number is greater than 1.
            if page_num > 1:
                data_lines = decoded_content.split(linebreak)[1:]
                decoded_content = str.join(linebreak, data_lines)

            resp += decoded_content.strip() + linebreak

        if save_as is None:
            return resp

        save_data(resp, save_as, DataFormat.CSV)

        return resp

[docs]    def get_dataframe(self):
        """
        Provides the data as as ``pandas.DataFrame`` object.

        .. versionadded:: 1.2.0

        .. warning::

            The ``pandas`` library is not included in the dependencies of this
            library and must be installed separately.

        Returns
        -------
        DataFrame

        Raises
        ------
        ImportError
            If the ``pandas`` library is not installed.
        """
        try:
            from pandas import DataFrame
        except ImportError:
            raise ImportError(
                "The `pandas` library is not installed as a part of the `uk-covid19` "
                "library. Please install the library and try again."
            )

        data = self.get_json()
        df = DataFrame(data["data"])

        return df

    def __str__(self):
        resp = "COVID-19 in the UK - API Service\nCurrent parameters: \n"
        return resp + dumps(self.api_params, indent=4)

    __repr__ = __str__