Source code for subsettools.clm

"""Functions to configure CLM for coupled ParFlow-CLM simulations."""

import os
from datetime import timedelta
import numpy as np
import pandas as pd
import hf_hydrodata
from parflow import write_pfb
from ._common import (
    get_utc_time,
    get_hf_gridded_data,
)
from ._error_checking import (
    _validate_dir,
    _validate_grid_bounds,
    _validate_date,
)


_VEGM_COLUMNS = 25



[docs]
def config_clm(ij_bounds, start, end, dataset, write_dir, time_zone="UTC"):
    """Modify template CLM driver files for a desired subdomain and run duration.

    This function will obtain template clm driver files (specifically vegm, vep
    and drv_clmin) from the existing national simulations on HydroData and modify
    them to reflect the desired subdomain (indicated by the ij_bounds) and run
    duration (indicated by the start and end dates). The modified files will be
    written out to a user specified directory. These files are required if you
    are going to run a ParFlow-CLM simulation.

    Args:
        ij_bounds (tuple[int]): bounding box for subset. This should be given as
            i,j index values where 0,0 is the lower left hand corner of a domain.
            ij_bounds are given relative to whatever grid is being used for the
            subset.
        start (str): start date (inclusive), in the form 'yyyy-mm-dd'
        end (str): end date (exlusive), in the form 'yyyy-mm-dd'
        dataset (str): the dataset that the files should be obtained from name
            e.g. "conus1_baseline_mod"
        write_dir (str): directory where the subset files will be written
        time_zone (str): timezone information for start and end dates. This
            should be a zoneinfo-supported time zone. Defaults to "UTC".

    Returns:
        A dictionary mapping the CLM file types ("vegp", "vegm", "drv_clm") to
        the corresponging filepaths where the CLM files were written.

    Example:

    .. code-block:: python

        filepaths = config_clm(
            ij_bounds=(375, 239, 487, 329),
            start="2005-10-01",
            end="2006-10-01",
            dataset="conus1_baseline_mod",
            write_dir="/path/to/your/chosen/directory"
        )
    """
    _validate_grid_bounds(ij_bounds)
    _validate_date(start)
    _validate_date(end)
    if not isinstance(dataset, str):
        raise TypeError("dataset name must be a string.")
    _validate_dir(write_dir)
    if not isinstance(time_zone, str):
        raise TypeError("time_zone must be a string.")

    # get the pfb version of the vegm file
    file_type_list = ["vegp", "pfb", "drv_clm"]
    file_paths = {}
    for file_type in file_type_list:
        if file_type == "vegp":
            file_path = os.path.join(write_dir, "drv_vegp.dat")
            try:
                hf_hydrodata.get_raw_file(
                    file_path,
                    dataset=dataset,
                    file_type=file_type,
                    variable="clm_run",
                    temporal_resolution="static",
                )
            except ValueError as err:
                print(f"Failed to get {file_type} file for dataset '{dataset}':", err)
            else:
                file_paths[file_type] = file_path
                print("copied vegp")
        elif file_type == "pfb":
            options = {
                "dataset": dataset,
                "file_type": file_type,
                "variable": "clm_run",
                "temporal_resolution": "static",
                "grid_bounds": ij_bounds,
            }
            subset_data = get_hf_gridded_data(options)
            land_cover_data = _reshape_ndarray_to_vegm_format(subset_data)
            file_path = _write_land_cover(land_cover_data, write_dir)
            file_paths[file_type] = file_path
            print("subset vegm")
        elif file_type == "drv_clm":
            file_path = os.path.join(write_dir, "drv_clmin.dat")
            try:
                hf_hydrodata.get_raw_file(
                    file_path,
                    dataset=dataset,
                    file_type=file_type,
                    variable="clm_run",
                    temporal_resolution="static",
                )
            except ValueError as err:
                print(f"Failed to get {file_type} file for dataset '{dataset}':", err)
            else:
                print("copied drv_clmin")
                _edit_drvclmin(
                    file_path=file_path, start=start, end=end, time_zone=time_zone
                )
                file_paths[file_type] = file_path
                print("edited drv_clmin")
    return file_paths



def _reshape_ndarray_to_vegm_format(data):
    """Reshape ndarray returned by datacatalog to vegm format.

    Args:
        data (ndarray): raw subset vegm data (2d array)

    Returns:
        Ndarray reshaped to vegm format.
    """
    _, nj, ni = data.shape
    indices = np.indices((nj, ni)) + 1
    indices = indices[::-1, :, :]
    data = np.vstack([indices, data])  # stack x,y indices on vegm
    # transpose and reshape back into expected 2D vegm file format for the subset
    return data.transpose(1, 2, 0).reshape(-1, _VEGM_COLUMNS)


def _write_land_cover(land_cover_data, write_dir):
    """Write the land cover ndarray in vegm format.

    Read in a gridded landcover dataset and write it out as a vegm file which is
    correctly formatted for CLM.

    Args:
        land_cover_data (ndarray): formatted vegm data (2d array)
        write_dir (str): path to output directory

    Returns:
        str:
            path to output vegm file.
    """
    heading = (
        "x y lat lon sand clay color fractional coverage of grid, by "
        "vegetation class (Must/Should Add to 1.0) "
    )
    vegm_col_names = (
        "",
        "",
        "(Deg)",
        "(Deg)",
        "(%/100)",
        "",
        "index",
        "1",
        "2",
        "3",
        "4",
        "5",
        "6",
        "7",
        "8",
        "9",
        "10",
        "11",
        "12",
        "13",
        "14",
        "15",
        "16",
        "17",
        "18",
    )
    header = "\n".join([heading, " ".join(vegm_col_names)])
    file_path = os.path.join(write_dir, "drv_vegm.dat")
    np.savetxt(
        fname=file_path,
        X=land_cover_data,
        delimiter=" ",
        comments="",
        header=header,
        fmt=["%d"] * 2 + ["%.6f"] * 2 + ["%.2f"] * 2 + ["%d"] * 19,
    )
    return file_path


def _edit_drvclmin(
    file_path,
    start=None,
    end=None,
    time_zone="UTC",
    startcode=2,
    vegp_name="drv_vegp.dat",
    vegm_name="drv_vegm.dat",
):
    """Edit a template CLM driver for a new simulation.

    Update the start and end dates, timezone, restart type and vegm and vegp
    input file names for a new simulation.

    Args:
        file_path (str): clm driver file path
        start (str): start date (inclusive), in the form 'yyyy-mm-dd'
        end (str): end date (exlusive), in the form 'yyyy-mm-dd'
        time_zone (str): time_zone used to calculate start/end dates. Defaults
            to "UTC".
        startcode (int): startcode for the parflow simulation
        vegp_name (str): vegp filename
        vegm_name (str): vegm filename

    Raises:
        AssertionError: If one of start or end is None and the other is not.
    """
    assert (start is None and end is None) or (start is not None and end is not None)

    with open(file_path, encoding="utf-8") as f:
        lines = f.readlines()

    for i, line in enumerate(lines):
        if "vegtf" in line:
            lines[i] = (
                f"{'vegtf':<15}{vegm_name:<37} Vegetation Tile Specification File\n"
            )
        elif "vegpf" in line:
            lines[i] = f"{'vegpf':<15}{vegp_name:<37} Vegetation Type Parameter\n"
        elif "startcode" in line:
            lines[i] = f"{'startcode':<15}{startcode:<37} 1=restart file, 2=defined\n"
        elif "clm_ic" in line:
            lines[i] = f"{'clm_ic':<15}{startcode:<37} 1=restart file, 2=defined\n"

    if start is not None:
        start_date = get_utc_time(start, time_zone)
        end_date = get_utc_time(end, time_zone) - timedelta(hours=1)

        for i, line in enumerate(lines):
            if "shr" in line:
                lines[i] = f"{'shr':<15}{start_date.hour:<37} Starting Hour\n"
            if "sda" in line:
                lines[i] = f"{'sda':<15}{start_date.day:<37} Starting Day\n"
            elif "smo" in line:
                lines[i] = f"{'smo':<15}{start_date.month:<37} Starting Month\n"
            elif "syr" in line:
                lines[i] = f"{'syr':<15}{start_date.year:<37} Starting Year\n"
            elif "ehr" in line:
                lines[i] = f"{'ehr':<15}{end_date.hour:<37} Ending Hour\n"
            elif "eda" in line:
                lines[i] = f"{'eda':<15}{end_date.day:<37} Ending Day\n"
            elif "emo" in line:
                lines[i] = f"{'emo':<15}{end_date.month:<37} Ending Month\n"
            elif "eyr" in line:
                lines[i] = f"{'eyr':<15}{end_date.year:<37} Ending Year\n"

    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(lines)



[docs]
def vegm_to_land_cover(vegm_path, write_pfb_path=None):
    """
    Convert a vegm.dat file in CLM format into a land cover array.

    This function assumes the vegm.dat file is in the standard format.
    That is, the file has 1 row per grid cell and each row contains 25 columns
    The columns are ordered as: x, y, lat, lon, sand, clay, color, then 18
    columns representing the fractional coverage of the grid cell by vegetation
    class (these final 18 columns add to 1.0 for each row). The rows are in
    ascending order by grid cell index with y as the outer loop and x as the
    inner loop.

    In cases in which the fractional vegetation coverage results in a tie
    between multiple vegetation classes, the final land cover array will
    use the first (lowest) land cover designation to break the tie
    (ie. the land cover array will contain designation 1 for a grid cell
    in which the vegetation distribution is 0.5 class 1 and 0.5 class 5).

    Args:
        vegm_path (str): path to vegm file
        write_pfb_path (str; optional): path to write output .pfb file to disk

    Returns:
        NumPy array containing the calculated land cover type for
        each domain grid cell.

        If `pfb_path` is provided, .pfb file is written to disk at the
        specified path.

    Example:

    .. code-block:: python

        land_cover_array = vegm_to_land_cover("/path/to/vegm/vegm.dat")
    """

    # Read in .dat file
    df = pd.read_csv(vegm_path, sep=r"\s+", skiprows=2, header=None)
    df.columns = [f"c{i}" for i in range(df.shape[1])]

    # Number of columns and rows determined by last line of file
    nx = int(df.iloc[-1]["c0"])
    ny = int(df.iloc[-1]["c1"])

    # Remove first seven columns: x,y,lat,lon,sand,clay,color index
    feature_cols = df.columns[7:]

    # Stack everything into (ny, nx, n_features)
    data = np.stack([df[c].values.reshape((ny, nx)) for c in feature_cols], axis=-1)

    # Reshape data into z, y, x where z is the vegm type
    data = np.transpose(data, (2, 0, 1))

    # Reduce array of land cover indicators to a single land cover type
    # Add 1 to account for land cover indexing starting from 1 instead of 0
    # The array land_cover will be of shape (y, x)
    # Note: np.argmax breaks ties by returning the index of the first
    # occurrence of the maximum value
    land_cover = (np.argmax(data[:, :, :], axis=0) + 1).astype(np.float64)

    # Write to disk as .pfb if requested
    if write_pfb_path:
        try:
            write_pfb(write_pfb_path, land_cover, dist=False)
        except ValueError as err:
            print(f"Failed to write .pfb to {write_pfb_path}:", err)

    return land_cover