# -*- coding: utf-8 -*-
"""
Useful for documentation and to play with pyXpcm
Data files should be hosted on another repo
"""
import os
# from os.path import dirname, join
import numpy as np
import xarray as xr
import hashlib
from urllib.request import urlretrieve
_default_cache_dir = os.sep.join(("~", ".pyxpcm_tutorial_data"))
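# Note: cached files land in '~/.pyxpcm_tutorial_data'; the path is expanded
# with os.path.expanduser in _open_dataset below.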
def open_dataset(name):
""" Open a dataset from the pyXpcm online data repository (requires internet).
If a local copy is found then always use that to avoid network traffic.
Parameters
----------
name : str
Name of the dataset to load among:
- `dummy` (depth,sample) dummy array
- `argo` (depth,sample) real Argo data sample
- `isas_snapshot` (depth,latitude,longitude) real gridded product
- `isas_series` (depth,latitude,longitude,time) real gridded product time series
Returns
-------
:class:`xarray.Dataset`
"""
if name == 'argo':
acc = argo_loader(what='sample')
elif name == 'isas_snapshot':
acc = isas_loader(what='sample_snapshot')
elif name == 'isas_series':
acc = isas_loader(what='sample_series')
elif name == 'dummy':
acc = dummy()
elif name == 'orsi':
acc = orsi()
else:
raise ValueError("Don't know this tutorial dataset")
return acc
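# A minimal usage sketch (hypothetical session; every dataset except 'dummy'
# needs an internet connection on the first call):
#
#   >>> from pyxpcm import tutorial
#   >>> ds = tutorial.open_dataset('argo').load()
#   >>> ds['TEMP']  # temperature profiles on standard depth levels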
class dummy():
    """ Loader for a dummy dataset of random profiles """

    def load(self, Np=1000, Nz=50):
        """Return a dataset of Np random (TEMP, PSAL) profiles on Nz depth levels"""
z = np.linspace(0, -500, Nz)
ds = xr.Dataset({
'TEMP': xr.DataArray(np.random.rand(Np, Nz),
dims=['n_prof', 'depth'], coords={'depth': z}),
'PSAL': xr.DataArray(np.random.rand(Np, Nz),
dims=['n_prof', 'depth'], coords={'depth': z})
})
ds['depth'].attrs['axis'] = 'Z'
ds['depth'].attrs['units'] = 'meters'
ds['depth'].attrs['positive'] = 'up'
ds['TEMP'].attrs['feature_name'] = 'temperature'
ds['PSAL'].attrs['feature_name'] = 'salinity'
ds.attrs['comment'] = "Dummy fields with random values"
return ds
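# Sketch of what dummy().load() returns (values are random; depth is negative
# downward, consistent with the 'positive: up' attribute set above):
#
#   >>> ds = dummy().load(Np=10, Nz=5)
#   >>> ds['TEMP'].shape
#   (10, 5)
#   >>> ds['depth'].values
#   array([   0., -125., -250., -375., -500.])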
class argo_loader():
    """ Loader for a sample of real Argo profiles """

    def __init__(self, what='sample'):
        categories = ['sample']
        if what not in categories:
            raise ValueError("I can't load a '%s' of Argo data" % what)
        self.category = what
def load(self):
"""Load and return a sample of Argo profiles on standard depth levels"""
if self.category == 'sample':
ncfile = 'argo_sample'
ds = _open_dataset(ncfile)
            # TODO: add these attributes directly into the netCDF file
ds['DEPTH'].attrs['axis'] = 'Z'
ds['DEPTH'].attrs['units'] = 'meters'
ds['DEPTH'].attrs['positive'] = 'up'
ds.attrs = dict()
ds.attrs['Sample test prepared by'] = "G. Maze"
ds.attrs['Institution'] = "Ifremer/LOPS"
ds.attrs['Data source DOI'] = "10.17882/42182"
return ds
class isas_loader():
    """ Loader for a sample of the ISAS gridded product """

    def __init__(self, what='sample_snapshot', version='15'):
        self.version = version
        categories = ['sample_snapshot', 'sample_series']
        if what not in categories:
            raise ValueError("I can't load a '%s' of ISAS data" % what)
        self.category = what
def load(self):
"""Load and return a sample of ISAS profiles on standard depth levels"""
if self.category == 'sample_snapshot':
ncfile = 'isas15_sample'
ds = _open_dataset(ncfile)
            # TODO: add these attributes directly into the netCDF file
ds['depth'] = -np.abs(ds['depth'])
ds['depth'].attrs['axis'] = 'Z'
ds['depth'].attrs['units'] = 'meters'
ds['depth'].attrs['positive'] = 'up'
ds['SST'] = ds['TEMP'].isel(depth=0)
ds = ds.chunk({'latitude': None, 'longitude': None})
elif self.category == 'sample_series':
ncfile = 'isas15series_sample'
ds = _open_dataset(ncfile)
            # TODO: add these attributes directly into the netCDF file
ds['depth'] = -np.abs(ds['depth'])
ds['depth'].attrs['axis'] = 'Z'
ds['depth'].attrs['units'] = 'meters'
ds['depth'].attrs['positive'] = 'up'
ds['SST'] = ds['TEMP'].isel(depth=0)
ds = ds.chunk({'latitude': None, 'longitude': None, 'time': None})
return ds
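# Example of working with the gridded ISAS sample (hypothetical session; the
# 'SST' field is the surface level extracted with isel(depth=0) above):
#
#   >>> ds = isas_loader(what='sample_snapshot').load()
#   >>> ds['SST'].plot()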
class orsi():
def load(self):
"""Load path of ORSI fronts"""
ncfile = 'ORSIfronts'
ds = _open_dataset(ncfile)
return ds
#######
# This is heavily borrowed/copied from https://github.com/pydata/xarray/blob/master/xarray/tutorial.py
def file_md5_checksum(fname):
    """Return the hex MD5 digest of a file"""
    hash_md5 = hashlib.md5()
with open(fname, "rb") as f:
hash_md5.update(f.read())
return hash_md5.hexdigest()
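# Sketch of how this checksum is used in _open_dataset below to validate a
# downloaded file ('some_file.nc' / 'some_file.md5' are placeholder names):
#
#   >>> local = file_md5_checksum('some_file.nc')
#   >>> with open('some_file.md5') as f:
#   ...     remote = f.read().strip()
#   >>> assert local == remote, "checksum mismatch"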
# idea borrowed from Seaborn
def _open_dataset(
name,
cache=True,
cache_dir=_default_cache_dir,
github_url="https://github.com/obidam/pyxpcm-data",
branch="master",
**kws,
):
"""
Open a dataset from the pyXpcm online data repository (requires internet).
If a local copy is found then always use that to avoid network traffic.
Parameters
----------
name : str
Name of the netcdf file containing the dataset
ie. 'argo_sample'
cache_dir : string, optional
The directory in which to search for and write cached data.
cache : boolean, optional
If True, then cache data locally for use on subsequent calls
github_url : string
Github repository where the data is stored
branch : string
The git branch to download from
kws : dict, optional
Passed to xarray.open_dataset
Returns
-------
:class:`xarray.Dataset`
"""
longdir = os.path.expanduser(cache_dir)
fullname = name + ".nc"
localfile = os.sep.join((longdir, fullname))
md5name = name + ".md5"
md5file = os.sep.join((longdir, md5name))
if not os.path.exists(localfile):
# This will always leave this directory on disk.
# May want to add an option to remove it.
if not os.path.isdir(longdir):
os.mkdir(longdir)
url = "/".join((github_url, "raw", branch, fullname))
urlretrieve(url, localfile)
url = "/".join((github_url, "raw", branch, md5name))
urlretrieve(url, md5file)
localmd5 = file_md5_checksum(localfile)
        with open(md5file, "r") as f:
            # strip() tolerates a possible trailing newline in the .md5 file
            remotemd5 = f.read().strip()
if localmd5 != remotemd5:
os.remove(localfile)
msg = """
MD5 checksum does not match, try downloading dataset again.
"""
raise OSError(msg)
ds = xr.open_dataset(localfile, **kws)
if not cache:
ds = ds.load()
os.remove(localfile)
return ds
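# Quick offline smoke test (the 'dummy' dataset requires no download):
if __name__ == "__main__":
    ds = open_dataset('dummy').load()
    print(ds)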