Source code for ost.s1.search

#! /usr/bin/env python
# -*- coding: utf-8 -*-

"""
Based on a set of search parameters the script will create a query
on www.scihub.copernicus.eu and return the results either
as shapefile, sqlite, or write to a PostGreSQL database.

----------------
Functions:
----------------

    gdfInv2Pg:
        writes the search result into a PostGreSQL/PostGIS Database
    gdfInv2Sqlite: (tba)
        writes the search result into a SqLite/SpatiaLite Database

------------------
Main function
------------------
  scihubSearch:
    handles the whole search process, i.e. login, query creation, search
    and write to desired output format

------------------
Contributors
------------------

Andreas Vollrath, ESA phi-lab
-----------------------------------
August 2018: Original implementation

------------------
Usage
------------------

python3 search.py -a /path/to/aoi-shapefile.shp -b 2018-01-01 -e 2018-31-12
                   -t GRD -m VV -b IW -o /path/to/search.shp

    -a         defines ISO3 country code or path to an ESRI shapefile
    -s         defines the satellite platform (Sentinel-1, Sentinel-2, etc.)
    -b         defines start date*
    -e         defines end date for search*
    -t         defines the product type (i.e. RAW,SLC or GRD)*
    -m         defines the polarisation mode (VV, VH, HH or HV)*
    -b         defines the beammode (IW,EW or SM)*
    -o         defines output that can be a shapefile (ending with .shp),
               a SQLite DB (ending with .sqlite) or a PostGreSQL DB (no suffix)
    -u         the scihub username*
    -p         the scihub secret password*

    * optional, i.e will look for all available products as well as ask for
      username and password during script execution
"""

# import stdlib modules
import os
import sys
import datetime
import logging
from urllib.error import URLError
import xml.dom.minidom
import dateutil.parser
from pathlib import Path

# import external modules
import pandas as pd
import geopandas as gpd
from shapely.wkt import dumps, loads

# internal OST libs
from ost.helpers.db import pgHandler
from ost.helpers import scihub

# set up logger
logger = logging.getLogger(__name__)

CONNECTION_ERROR = "We failed to connect to the server. Reason: "
CONNECTION_ERROR_2 = "The server couldn't fulfill the request. Error code: "


def _read_xml(dom):

    acq_list = []
    # loop through each entry (with all metadata)
    for node in dom.getElementsByTagName("entry"):

        # we get all the date entries
        dict_date = {
            s.getAttribute("name"): dateutil.parser.parse(s.firstChild.data).astimezone(dateutil.tz.tzutc())
            for s in node.getElementsByTagName("date")
        }

        # we get all the int entries
        dict_int = {s.getAttribute("name"): s.firstChild.data for s in node.getElementsByTagName("int")}

        # we create a filter for the str entries (we do not want all)
        # and get them
        dict_str = {s.getAttribute("name"): s.firstChild.data for s in node.getElementsByTagName("str")}

        # merge the dicts and append to the catalogue list
        acq = dict(dict_date, **dict_int, **dict_str)

        # fill in emtpy fields in dict by using identifier
        if "swathidentifier" not in acq.keys():
            acq["swathidentifier"] = acq["identifier"].split("_")[1]
        if "producttype" not in acq.keys():
            acq["producttype"] = acq["identifier"].split("_")[2]
        if "slicenumber" not in acq.keys():
            acq["slicenumber"] = 0

        # append all scenes from this page to a list
        acq_list.append(
            [
                acq["identifier"],
                acq["polarisationmode"],
                acq["orbitdirection"],
                acq["beginposition"].strftime("%Y%m%d"),
                acq["relativeorbitnumber"],
                acq["orbitnumber"],
                acq["producttype"],
                acq["slicenumber"],
                acq["size"],
                acq["beginposition"].isoformat(),
                acq["endposition"].isoformat(),
                acq["lastrelativeorbitnumber"],
                acq["lastorbitnumber"],
                acq["uuid"],
                acq["platformidentifier"],
                acq["missiondatatakeid"],
                acq["swathidentifier"],
                acq["ingestiondate"].isoformat(),
                acq["sensoroperationalmode"],
                loads(acq["footprint"]),
            ]
        )

    # transform all results from that page to a gdf
    return acq_list


def _query_scihub(opener, query):
    """
    Get the data from the scihub catalogue
    and write it to a GeoPandas GeoDataFrame
    """

    # create empty GDF
    columns = [
        "identifier",
        "polarisationmode",
        "orbitdirection",
        "acquisitiondate",
        "relativeorbitnumber",
        "orbitnumber",
        "producttype",
        "slicenumber",
        "size",
        "beginposition",
        "endposition",
        "lastrelativeorbitnumber",
        "lastorbitnumber",
        "uuid",
        "platformidentifier",
        "missiondatatakeid",
        "swathidentifier",
        "ingestiondate",
        "sensoroperationalmode",
        "geometry",
    ]

    crs = "epsg:4326"
    geo_df = gpd.GeoDataFrame(columns=columns, crs=crs)

    # we need this for the paging
    index, rows, next_page = 0, 99, 1

    while next_page:

        # construct the final url
        url = query + f"&rows={rows}&start={index}"
        try:
            # get the request
            req = opener.open(url)
        except URLError as error:
            if hasattr(error, "reason"):
                logger.info(f"{CONNECTION_ERROR}{error.reason}")
                sys.exit()
            elif hasattr(error, "code"):
                logger.info(f"{CONNECTION_ERROR_2}{error.code}")
                sys.exit()
        else:
            # write the request to to the response variable
            # (i.e. the xml coming back from scihub)
            response = req.read().decode("utf-8")

            # parse the xml page from the response
            dom = xml.dom.minidom.parseString(response)

            acq_list = _read_xml(dom)

            gdf = gpd.GeoDataFrame(acq_list, columns=columns, crs=crs)

            # append the gdf to the full gdf
            geo_df = pd.concat([geo_df, gdf])

        # retrieve next page and set index up by 99 entries
        next_page = scihub.next_page(dom)
        index += rows

    return geo_df


def _to_shapefile(gdf, outfile, append=False):

    # check if file is there
    if os.path.isfile(outfile):

        # in case we want to append, we load the old one and add the new one
        if append:
            columns = [
                "id",
                "identifier",
                "polarisationmode",
                "orbitdirection",
                "acquisitiondate",
                "relativeorbit",
                "orbitnumber",
                "product_type",
                "slicenumber",
                "size",
                "beginposition",
                "endposition",
                "lastrelativeorbitnumber",
                "lastorbitnumber",
                "uuid",
                "platformidentifier",
                "missiondatatakeid",
                "swathidentifier",
                "ingestiondate",
                "sensoroperationalmode",
                "geometry",
            ]

            # get existing geodataframe from file
            old_df = gpd.read_file(outfile)
            old_df.columns = columns
            # drop id
            old_df.drop("id", axis=1, inplace=True)
            # append new results
            gdf.columns = columns[1:]
            gdf = old_df.append(gdf)

            # remove duplicate entries
            gdf.drop_duplicates(subset="identifier", inplace=True)

        # remove old file
        os.remove(outfile)
        os.remove("{}.cpg".format(outfile[:-4]))
        os.remove("{}.prj".format(outfile[:-4]))
        os.remove("{}.shx".format(outfile[:-4]))
        os.remove("{}.dbf".format(outfile[:-4]))

    # calculate new index
    gdf.insert(loc=0, column="id", value=range(1, 1 + len(gdf)))

    # write to new file
    if len(gdf.index) >= 1:
        gdf.to_file(outfile)
    else:
        logger.info("No scenes found in this AOI during this time")


def _to_geopackage(gdf, outfile, append=False):

    # check if file is there
    if Path(outfile).exists():

        # in case we want to append, we load the old one and add the new one
        if append:
            columns = [
                "id",
                "identifier",
                "polarisationmode",
                "orbitdirection",
                "acquisitiondate",
                "relativeorbit",
                "orbitnumber",
                "product_type",
                "slicenumber",
                "size",
                "beginposition",
                "endposition",
                "lastrelativeorbitnumber",
                "lastorbitnumber",
                "uuid",
                "platformidentifier",
                "missiondatatakeid",
                "swathidentifier",
                "ingestiondate",
                "sensoroperationalmode",
                "geometry",
            ]

            # get existing geodataframe from file
            old_df = gpd.read_file(outfile)
            old_df.columns = columns
            # drop id
            old_df.drop("id", axis=1, inplace=True)
            # append new results
            gdf.columns = columns[1:]
            gdf = old_df.append(gdf)

            # remove duplicate entries
            gdf.drop_duplicates(subset="identifier", inplace=True)

        # remove old file
        Path(outfile).unlink()

    # calculate new index
    gdf.insert(loc=0, column="id", value=range(1, 1 + len(gdf)))

    # write to new file
    if len(gdf.index) > 0:
        gdf.to_file(outfile, driver="GPKG")
    else:
        logger.info("No scenes found in this AOI during this time")


def _to_postgis(gdf, db_connect, outtable):

    # check if tablename already exists
    db_connect.cursor.execute(
        "SELECT EXISTS (SELECT * FROM "
        "information_schema.tables WHERE "
        "LOWER(table_name) = "
        "LOWER('{}'))".format(outtable)
    )
    result = db_connect.cursor.fetchall()
    if result[0][0] is False:
        logger.info(f"Table {outtable} does not exist in the database. Creating it...")
        db_connect.pgCreateS1("{}".format(outtable))
        maxid = 1
    else:
        try:
            maxid = db_connect.pgSQL(f"SELECT max(id) FROM {outtable}")
            maxid = maxid[0][0]
            if maxid is None:
                maxid = 0

            logger.info(
                f"Table {outtable} already exists with {maxid} entries. "
                f"Will add all non-existent results to this table."
            )
            maxid = maxid + 1
        except Exception:
            raise RuntimeError(
                f"Existent table {outtable} does not seem to be compatible " f"with Sentinel-1 data."
            )

    # add an index as first column
    gdf.insert(loc=0, column="id", value=range(maxid, maxid + len(gdf)))
    db_connect.pgSQLnoResp(f"SELECT UpdateGeometrySRID('{outtable.lower()}', 'geometry', 0);")

    # construct the SQL INSERT line
    for _index, row in gdf.iterrows():

        row["geometry"] = dumps(row["footprint"])
        row.drop("footprint", inplace=True)
        identifier = row.identifier
        uuid = row.uuid
        line = tuple(row.tolist())

        # first check if scene is already in the table
        result = db_connect.pgSQL("SELECT uuid FROM {} WHERE " "uuid = '{}'".format(outtable, uuid))
        try:
            result[0][0]
        except IndexError:
            logger.info(f"Inserting scene {identifier} to {outtable}")
            db_connect.pgInsert(outtable, line)
            # apply the dateline correction routine
            db_connect.pgDateline(outtable, uuid)
            maxid += 1
        else:
            logger.info(f"Scene {identifier} already exists within table {outtable}.")

    logger.info(f"Inserted {len(gdf)} entries into {outtable}.")
    logger.info(f"Table {outtable} now contains {maxid - 1} entries.")
    logger.info("Optimising database table.")

    # drop index if existent
    try:
        db_connect.pgSQLnoResp("DROP INDEX {}_gix;".format(outtable.lower()))
    except Exception:
        pass

    # create geometry index and vacuum analyze
    db_connect.pgSQLnoResp("SELECT UpdateGeometrySRID('{}', " "'geometry', 4326);".format(outtable.lower()))
    db_connect.pgSQLnoResp(
        "CREATE INDEX {}_gix ON {} USING GIST " "(geometry);".format(outtable, outtable.lower())
    )
    db_connect.pgSQLnoResp("VACUUM ANALYZE {};".format(outtable.lower()))


[docs]def check_availability(inventory_gdf, download_dir, data_mount):
    """This function checks if the data is already downloaded or
       available through a mount point on DIAS cloud

    :param inventory_gdf:
    :param download_dir:
    :param data_mount:
    :return:
    """

    from ost import Sentinel1Scene

    # add download path, or set to None if not found
    inventory_gdf["download_path"] = inventory_gdf.identifier.apply(
        lambda row: str(Sentinel1Scene(row).get_path(download_dir, data_mount))
    )

    return inventory_gdf


[docs]def scihub_catalogue(
    query_string,
    output,
    append=False,
    uname=None,
    pword=None,
    base_url="https://apihub.copernicus.eu/apihub",
):
    """This is the main search function on scihub

    :param query_string:
    :param output:
    :param append:
    :param uname:
    :param pword:
    :param base_url:
    :return:
    """

    # retranslate Path object to string
    output = str(output)

    # get connected to scihub
    hub = f"{base_url}/search?q="
    opener = scihub.connect(uname, pword, base_url)
    query = f"{hub}{query_string}"

    # get the catalogue in a dict
    gdf = _query_scihub(opener, query)

    if output[-4:] == ".shp":
        logger.info(f"Writing inventory data to shape file: {output}")
        _to_shapefile(gdf, output, append)
    elif output[-5:] == ".gpkg":
        logger.info(f"Writing inventory data to geopackage file: {output}")
        _to_geopackage(gdf, output, append)
    else:
        logger.info(f"Writing inventory data toPostGIS table: {output}")
        db_connect = pgHandler()
        _to_postgis(gdf, db_connect, output)


if __name__ == "__main__":

    import argparse
    import urllib
    from ost.helpers import helpers

    # get the current date
    NOW = datetime.datetime.now()
    NOW = NOW.strftime("%Y-%m-%d")

    # write a description
    DESCRIPT = """
               This is a command line client for the inventory of Sentinel-1
               data on the Copernicus Scihub server.
               Output can be either an:
                    - exisiting PostGreSQL database
                    - newly created or existing SqLite database
                    - ESRI Shapefile
               """

    EPILOG = """
             Examples:
             search.py -a /path/to/aoi-shapefile.shp -b 2018-01-01
                       -e 2018-31-12
             """
    # create a PARSER
    PARSER = argparse.ArgumentParser(description=DESCRIPT, epilog=EPILOG)

    # username/password scihub
    PARSER.add_argument("-u", "--username", help=" Your username of scihub.copernicus.eu ", default=None)
    PARSER.add_argument(
        "-p",
        "--password",
        help=" Your secret password of scihub.copernicus.eu ",
        default=None,
    )
    PARSER.add_argument(
        "-a",
        "--areaofinterest",
        help=(" The Area of Interest (path to a shapefile" "or ISO3 country code)"),
        dest="aoi",
        default="*",
        type=lambda x: helpers.is_valid_aoi(PARSER, x),
    )
    PARSER.add_argument(
        "-b",
        "--begindate",
        help=" The Start Date (format: YYYY-MM-DD) ",
        default="2014-10-01",
        type=lambda x: helpers.is_valid_date(PARSER, x),
    )
    PARSER.add_argument(
        "-e",
        "--enddate",
        help=" The End Date (format: YYYY-MM-DD)",
        default=NOW,
        type=lambda x: helpers.is_valid_date(PARSER, x),
    )
    PARSER.add_argument("-t", "--producttype", help=" The Product Type (RAW, SLC, GRD, *) ", default="*")
    PARSER.add_argument(
        "-m",
        "--polarisation",
        help=" The Polarisation Mode (VV, VH, HH, HV, *) ",
        default="*",
    )
    PARSER.add_argument("-b", "--beammode", help=" The Beam Mode (IW, EW, SM, *) ", default="*")

    # output parameters
    PARSER.add_argument(
        "-o",
        "--output",
        help=(
            " Output format/file. Can be a shapefile"
            " (ending with .shp), a SQLite file"
            " (ending with .sqlite) or a PostGreSQL table"
            " (connection needs to be configured). "
        ),
        required=True,
    )

    ARGS = PARSER.parse_args()

    # execute full search
    if ARGS.aoi != "*" and ARGS.aoi[-4] == "shp":
        AOI = os.path.abspath(ARGS.aoi)
    else:
        AOI = "*"

    # construct the search command (do not change)
    AOI = scihub.create_aoi_str(AOI)
    TOI = scihub.create_toi_str(ARGS.begindate, ARGS.enddate)
    PRODUCT_SPECS = scihub.create_s1_product_specs(ARGS.producttype, ARGS.polarisation, ARGS.beam)

    QUERY = urllib.parse.quote(f"Sentinel-1 AND {PRODUCT_SPECS} AND {AOI} AND {TOI}")
    # execute full search
    scihub_catalogue(QUERY, ARGS.output, ARGS.username, ARGS.password)