# Source code for nwm_region_mgr.formreg.config_schema (Sphinx viewcode export header)
"""Configuration schema for the formulation regionalization application.
config_schema.py defines the Pydantic models that represent the configuration of this application.
Classes:
- FormulationGeneralSettings: General settings for the formulation regionalization application.
- BestFormulation: Configuration for determining the best formulation for each spatial unit.
- FormulationSpatialUnitConfig: Spatial discretization settings for formulation regionalization.
- MetricConfig: Configuration for an individual metric used in summary scoring.
- MetricEvalPeriod: Configuration for the evaluation period of metrics to be used for screening donors.
- FormulationSummaryScoreConfig: Configuration for computing a summary score for formulation as a weighted average of normalized metrics.
- FormulationCostConfig: Computational cost of each formulation.
- FormulationOutputConfig: Output configuration for formulation regionalization.
- Config: Top-level configuration for formulation regionalization.
"""
import logging
from enum import Enum
from pathlib import Path
from typing import Dict, List, Literal, Union
from pydantic import BaseModel, Field, model_validator
from nwm_region_mgr.utils import (
BaseConfig,
BaseGeneralConfig,
BaseOutputConfig,
check_options,
)
logger = logging.getLogger(__name__)
# [docs] (Sphinx viewcode link artifact)
class FormulationGeneralSettings(BaseGeneralConfig):
    """General settings for the formulation regionalization application.

    Extends ``BaseGeneralConfig`` with the inputs that drive formulation
    regionalization: hydrofabric/crosswalk file paths, formulation
    include/exclude lists, and cost handling.
    """

    # Optional path to the HUC12 hydrofabric (polygon) dataset.
    # Written as ``str | Path | None`` for consistency with the union style
    # used throughout this module (was ``Union[str, Path] | None``).
    huc12_hydrofabric_file: str | Path | None = Field(
        description="Path to HUC12 hydrofabric file containing HUC12 polygons for spatial discretization.",
        examples="NationalWBDSnapshot.gdb",
        default=None,
    )
    # Optional path to the HUC12 <-> NextGen divide crosswalk CSV.
    divide_huc12_cwt_file: str | None = Field(
        description=(
            "Path to crosswalk file between HUC12 basins and NextGen catchments, "
            "with columns 'divide_id' and 'huc_12'."
        ),
        examples="cwt_divide_huc12_{domain}.csv",
        default=None,
    )
    # When True, restrict formulation selection to calibrated basins only.
    calib_basins_only: bool = Field(
        description=(
            "Whether to run formulation selection only for calibrated basins (based on summary score). "
            "Set to True to limit formulation selection to calibrated basins only; in such cases, "
            "parameter regionalization for uncalibrated catchments will not consider preferred formulations."
        ),
        examples=False,
        default=False,
    )
    # Whitelist of formulations to consider (None means all available).
    formulation_to_include: List[str] | None = Field(
        description=(
            "List of formulations to consider. If None, all available formulations are considered. "
            "If 'all', all formulations are included."
        ),
        examples=[
            "noah-owp-modular cfe-s t-route",
            "noah-owp-modular ueb cfe-x t-route",
        ],
        default=None,
    )
    # Blacklist of formulations to drop from the available options.
    formulation_to_exclude: List[str] | None = Field(
        description="List of formulations to exclude. If None, no formulations are excluded from available options.",
        examples=["noah-owp-modular cfe-s t-route"],
        default=None,
    )
    # Whether computational cost enters the regionalization decision.
    # NOTE(review): the example (False) differs from the default (True) —
    # confirm this is intentional.
    consider_cost: bool = Field(
        description="Whether to consider computational costs of formulations in the regionalization process.",
        examples=False,
        default=True,
    )

    @model_validator(mode="after")
    def check_approach_calib_basins(self) -> "FormulationGeneralSettings":
        """Ensure that the approach for assigning formulation to calibrated basins is valid."""
        # ``approach_calib_basins`` is presumably declared on
        # BaseGeneralConfig — it is not defined in this class; verify.
        valid_approaches = ["regionalization", "summary_score"]
        check_options(
            self.approach_calib_basins, valid_approaches, "approach_calib_basins"
        )
        return self
# [docs] (Sphinx viewcode link artifact)
class BestFormulation(BaseModel):
    """Configuration for determining the best formulation for each spatial unit."""

    # Aggregation method across subdivisions; see ``type`` for the
    # subdivision level the scores are aggregated over.
    method: Literal["total_score", "average_score"] = Field(
        description=(
            "Method to determine the best formulation, options: 'total_score', 'average_score', which "
            "selects the formulation with the highest total or average summary score across all "
            "subdivisions (e.g., basins or divides as specified by the 'type' field), respectively."
        ),
        examples="total_score",
        default="total_score",
    )
    # Required field (no default).
    # NOTE(review): FormulationSpatialUnitConfig uses
    # ``default_factory=BestFormulation``, which will fail validation because
    # this field has no default — confirm whether a default is intended.
    type: Literal["basin", "divide"] = Field(
        description="Type of subdivision to use for computing total or average score, options: 'basin', 'divide'.",
        examples="basin",
    )
    # Formulations scoring within ``tolerance`` of the best score are
    # treated as equally good (ties).
    tolerance: float = Field(
        description=(
            "Tolerance (on scale of 0.0 to 1.0) for the summary score. Formulations within this tolerance of the best "
            "score are considered equally good."
        ),
        examples=0.05,
        default=0.05,
        ge=0.0,
        le=1.0,
    )
# [docs] (Sphinx viewcode link artifact)
class FormulationSpatialUnitConfig(BaseModel):
    """Spatial discretization settings for formulation regionalization."""

    # USGS HUC level; normalized to the canonical 'hucX' form by
    # ``normalize_huc`` below.
    huc_level: str = Field(
        description=(
            "USGS HUC level used for spatial discretization (e.g., 'huc8'). "
            "A single formulation is selected per spatial unit given the spatial discretization level. "
            "Accepted formats: 'huc8', 'HUC8', 'huc-8'."
        ),
        examples=["huc2", "huc4", "huc6", "huc8", "huc10", "huc12"],
        default="huc8",
    )
    # Minimum calibration basins per unit for a valid selection.
    nmin_calib_basin: int = Field(
        description="Minimum number of calibration basins required per spatial unit for valid formulation selection.",
        examples=5,
        default=3,
    )
    # Fallback strategy when a unit has fewer than ``nmin_calib_basin`` basins.
    basin_fill_method: Literal["upscaling", "nearest-neighbor"] = Field(
        description=(
            "Method to handle spatial units with too few calibration basins. Options: "
            "'upscaling' (by upscaling to a coarser spatial unit), and "
            "'nearest-neighbor' (by pooling basins from neighboring units)."
        ),
        examples="upscaling",
        default="upscaling",
    )
    best_formulation: BestFormulation = Field(
        description="Strategy to determine the best formulation for each spatial unit.",
        examples={"method": "total_score", "type": "divide", "tolerance": 0.05},
        default_factory=BestFormulation,
    )

    @model_validator(mode="before")
    @classmethod
    def normalize_huc(cls, values):
        """Normalize ``huc_level`` to the standard ``hucX`` format and validate it.

        Raises:
            ValueError: If ``huc_level`` is not one of huc2/4/6/8/10/12
                (case-insensitive, optional hyphen).
        """
        # mode="before" validators receive the raw input, which is not
        # guaranteed to be a dict; pass anything else through untouched
        # (the original would raise AttributeError on non-dict input).
        if not isinstance(values, dict):
            return values
        huc = values.get("huc_level")
        if isinstance(huc, str):
            # Lowercase and strip the optional hyphen, e.g. 'HUC-8' -> 'huc8'.
            normalized = huc.lower().replace("-", "")
            valid_hucs = {f"huc{i}" for i in [2, 4, 6, 8, 10, 12]}
            if normalized not in valid_hucs:
                raise ValueError(
                    f"Invalid huc_level: {huc}. Must be one of {sorted(valid_hucs)}"
                )
            # Standardize to format hucX (e.g., huc8)
            values["huc_level"] = normalized
        return values
# [docs] (Sphinx viewcode link artifact)
class Orientation(str, Enum):
    """Orientation for metrics: 'positive' means higher is better, 'negative' means lower is better."""

    # NOTE(review): not referenced by the models in this module —
    # ``MetricConfig.orientation`` uses a Literal with the same values.
    # Confirm whether this enum is used elsewhere or is dead code.
    positive = "positive"
    negative = "negative"
# [docs] (Sphinx viewcode link artifact)
class MetricConfig(BaseModel):
    """Configuration for an individual metric used in summary scoring."""

    # Upper bound for scaling/normalization. The upper > lower relationship
    # is enforced by FormulationSummaryScoreConfig.validate_all_metrics,
    # not here.
    upper: float | None = Field(
        description="Upper bound for scaling and normalization, must be greater than lower bound.",
        examples=1,
        default=None,
    )
    # Lower bound for scaling/normalization (see note on ``upper``).
    lower: float | None = Field(
        description="Lower bound for scaling and normalization, must be less than upper bound.",
        examples=0,
        default=None,
    )
    # Declared as a Literal; mirrors the values of the Orientation enum
    # defined in this module.
    orientation: Literal["positive", "negative"] = Field(
        description="Orientation of the metric, either 'positive' or 'negative'.",
        examples="positive",
        default="positive",
    )
    # Range [0, 1] is enforced by the ge/le constraints below. Metrics with
    # weight 0.0 are dropped by FormulationSummaryScoreConfig.
    weight: float = Field(
        description=(
            "Weight of the metric in the summary score, must be between 0.0 and 1.0. "
            "If 0.0, the metric is ignored."
        ),
        examples=0.25,
        default=0.0,
        ge=0.0,
        le=1.0,
    )
    # Use |value| before normalization (useful for sign-symmetric metrics
    # such as bias).
    absolute: bool = Field(
        description="Whether to use the absolute value of the metric (e.g., for bias) for normalization.",
        examples=False,
        default=False,
    )
# [docs] (Sphinx viewcode link artifact)
class MetricEvalPeriod(BaseModel):
    """Configuration for the evaluation period of metrics to be used for screening donors."""

    # Column in the donor stats file that labels the evaluation period.
    col_name: str = Field(
        description="Name of the column in the donor stats file that contains the evaluation period.",
        examples="evalPeriod",
        default="evalPeriod",
    )
    # Period label to filter on (e.g. calibration vs validation vs full).
    value: str = Field(
        description="Value of the evaluation period to filter the donor stats file.",
        examples=["valid", "calib", "full"],
        default="full",
    )
# [docs] (Sphinx viewcode link artifact)
class FormulationSummaryScoreConfig(BaseModel):
    """Configuration for computing a summary score for formulation as a weighted average of normalized metrics."""

    metric_eval_period: MetricEvalPeriod | None = Field(
        description="Evaluation period of metrics to be used for screening donors.",
        examples={"col_name": "evalPeriod", "value": "valid"},
        default=None,
    )
    # Required field (no default).
    # NOTE(review): ``Config`` constructs this model via
    # ``default_factory=FormulationSummaryScoreConfig``, which will fail
    # validation when this field is absent — confirm intended.
    metrics: Dict[str, MetricConfig] = Field(
        description=(
            "Dictionary of metrics used in the summary score, keyed by metric name. "
            "Metric names must match columns in the calibration/validation stats file. Weights must sum to 1.0. "
            "Refer to schema of MetricConfig for individual metric settings."
        ),
        examples={
            "cor": {
                "upper": 1.0,
                "lower": -0.5,
                "orientation": "positive",
                "weight": 0.5,
            },
            "kge": {
                "upper": 1.0,
                "lower": -0.5,
                "orientation": "positive",
                "weight": 0.5,
            },
        },
    )

    # Pydantic runs mode="after" validators in definition order: zero-weight
    # metrics are pruned first (they contribute nothing to the weight sum),
    # then bounds are validated, then the weight sum is checked.

    @model_validator(mode="after")
    def remove_zero_weighted_metrics(self) -> "FormulationSummaryScoreConfig":
        """Drop metrics with a weight of 0.0 from the configuration in place.

        Returns:
            This instance, with zero-weighted metrics removed from ``metrics``.
        """
        active_metrics = {
            name: metric for name, metric in self.metrics.items() if metric.weight > 0.0
        }
        dropped = set(self.metrics) - set(active_metrics)
        if dropped:
            logger.debug(
                f"Removed metrics with zero weight: {', '.join(dropped)}. "
                "These metrics will not contribute to the summary score."
            )
        self.metrics = active_metrics
        return self

    @model_validator(mode="after")
    def validate_all_metrics(self) -> "FormulationSummaryScoreConfig":
        """Ensure that all metrics have valid bounds (upper > lower).

        The weight range itself is not re-checked here: ``MetricConfig.weight``
        already enforces 0.0 <= weight <= 1.0 via its Field constraints, so the
        original per-metric range check was dead code.
        """
        for name, metric in self.metrics.items():
            if (
                metric.upper is not None
                and metric.lower is not None
                and metric.upper <= metric.lower
            ):
                msg = f"Invalid bounds for metric '{name}': upper={metric.upper}, lower={metric.lower}"
                logger.error(msg)
                raise ValueError(msg)
        return self

    @model_validator(mode="after")
    def check_weights_sum_to_one(self) -> "FormulationSummaryScoreConfig":
        """Ensure that the sum of all metric weights equals 1.0 (within 1e-6)."""
        total_weight = sum(metric.weight for metric in self.metrics.values())
        if abs(total_weight - 1.0) > 1e-6:
            msg = (
                f"The sum of all metric weights must equal 1.0, but got {total_weight}"
            )
            logger.error(msg)
            raise ValueError(msg)
        return self
# [docs] (Sphinx viewcode link artifact)
class FormulationCostConfig(BaseModel):
    """Computational cost of each formulation."""

    # CSV source for formulation costs; takes precedence over ``costs``
    # when provided.
    file: str | None = Field(
        description="Path to CSV file with formulation costs. If provided, costs will be read from this file.",
        examples="formulation_costs_secs_per_catchment.csv",
        default=None,
    )
    # Inline cost mapping (formulation name -> cost); ignored when ``file``
    # is set.
    costs: Dict[str, float] | None = Field(
        description="Dictionary of formulation costs, keyed by formulation name. If `file` is provided, this is ignored.",
        examples={"noah-owp-modular ueb cfe-x t-route": 10},
        default=None,
    )
# [docs] (Sphinx viewcode link artifact)
class FormulationOutputConfig(BaseModel):
    """Output configuration for formulation regionalization."""

    formulation: BaseOutputConfig = Field(
        description="Output configurations for the selected formulations.",
        default_factory=BaseOutputConfig,
        examples={
            "save": True,
            "path": "{base_dir}/outputs/{run_name}/formulations",
            "stem": "form_{domain}_vpu{vpu_list}",
            "stem_suffix": "_pars",  # suffix for the formulation file with parameters
            "format": "parquet",
            "plots": {
                "spatial_map": True,  # whether to create spatial map of selected formulations & scores
                "histogram": True,  # whether to create histogram of scores
            },
            "plot_path": "{base_dir}/outputs/{run_name}/formulations/plots",
        },
    )
    # ``default_factory`` added for consistency with ``formulation``: this
    # field was previously required, which made ``FormulationOutputConfig()``
    # (and hence ``Config``'s ``default_factory``) fail validation when the
    # output section was omitted.
    config_final: BaseOutputConfig = Field(
        description=(
            "Output configuration for the final configuration file after processing, with placeholders resolved."
        ),
        default_factory=BaseOutputConfig,
        examples={
            "save": True,
            "path": "{base_dir}/outputs/{run_name}/config_formreg_final.yaml",
        },
    )
    # ``default_factory`` added for the same reason as ``config_final``.
    summary_score: BaseOutputConfig = Field(
        description="Output configurations for the summary score.",
        default_factory=BaseOutputConfig,
        examples={
            "save": True,
            "path": "{base_dir}/outputs/{run_name}/summary_score",
            "stem": "score_{domain}_vpu{vpu_list}",
            "stem_suffix": "_all_gages",  # suffix for the summary score file containing all gages in the domain
            "format": "parquet",
            "plots": {"histogram": True, "spatial_map": True},
            "plot_path": "{base_dir}/outputs/{run_name}/summary_score/plots",
        },
    )
# [docs] (Sphinx viewcode link artifact)
class Config(BaseConfig):
    """Top-level configuration for formulation regionalization."""

    # Every section has a default_factory so a partial config file validates.
    # NOTE(review): some nested models (e.g. FormulationSummaryScoreConfig,
    # whose ``metrics`` field is required) cannot be constructed without
    # arguments, so those factories will raise if the section is omitted —
    # confirm intended.
    general: FormulationGeneralSettings = Field(
        description="General settings for formulation regionalization",
        default_factory=FormulationGeneralSettings,
    )
    spatial_unit: FormulationSpatialUnitConfig = Field(
        description="Spatial discretization settings for formulation regionalization.",
        default_factory=FormulationSpatialUnitConfig,
    )
    summary_score: FormulationSummaryScoreConfig = Field(
        description="Summary score computation configuration for formulation regionalization.",
        default_factory=FormulationSummaryScoreConfig,
    )
    formulation_cost: FormulationCostConfig = Field(
        description="Computational cost configuration for each formulation.",
        default_factory=FormulationCostConfig,
    )
    output: FormulationOutputConfig = Field(
        description="Output configuration for formulation regionalization.",
        default_factory=FormulationOutputConfig,
    )