Source code for tseda.dataloader.hyndman_examples_data_loader

"""Downloader for Hyndman-based example datasets used by tseda."""

from __future__ import annotations

from pathlib import Path

import pandas as pd



[docs]
class HyndmanExamplesDataLoader:
    """Download and normalize Hyndman example files into the local ``data`` directory.

    Each entry of ``DATASET_URLS`` is read as CSV, reduced to a strict
    two-column ``timestamp``/``value`` layout, and written under
    ``output_dir`` using the mapping key as the file name.
    """

    # Output file name -> URL of the raw CSV it is built from.
    DATASET_URLS = {
        "hyndman_goog_daily_close.csv": "https://raw.githubusercontent.com/rajivsam/tseda/main/data/hyndman_goog_daily_close.csv",
        "hyndman_hyndsight_daily_pageviews.csv": "https://raw.githubusercontent.com/rajivsam/tseda/main/data/hyndman_hyndsight_daily_pageviews.csv",
        "hyndman_arrivals_quarterly_japan.csv": "https://raw.githubusercontent.com/rajivsam/tseda/main/data/hyndman_arrivals_quarterly_japan.csv",
        "hyndman_usconsumption_quarterly_consumption.csv": "https://raw.githubusercontent.com/rajivsam/tseda/main/data/hyndman_usconsumption_quarterly_consumption.csv",
        "hyndman_sunspot_monthly_area.csv": "https://raw.githubusercontent.com/rajivsam/tseda/main/data/hyndman_sunspot_monthly_area.csv",
    }

    # Maximum number of rows (the most recent ones) kept per dataset.
    MAX_ROWS = 2000

    def __init__(self, output_dir: str = "data") -> None:
        """Set destination directory for downloaded example files.

        Args:
            output_dir: Directory the prepared CSV files are written to. It is
                created on first write if it does not exist yet.
        """
        self.output_dir = Path(output_dir)

    @staticmethod
    def _normalize_two_column_time_series(
        frame: pd.DataFrame, max_rows: int = MAX_ROWS
    ) -> pd.DataFrame:
        """Return a strict two-column format accepted by tseda uploads.

        Only the first two columns of ``frame`` are used; they are renamed to
        ``timestamp`` and ``value``. Rows whose timestamp or value cannot be
        parsed are dropped, the remainder is sorted by timestamp, and
        timestamps are rendered as ``YYYY-MM-DD`` strings.

        Args:
            frame: Raw dataset with the timestamp in the first column and the
                observation in the second.
            max_rows: Upper bound on the number of rows returned. When the
                cleaned data is longer, only the latest ``max_rows`` rows
                are kept.

        Returns:
            A new frame with exactly the columns ``timestamp`` and ``value``.

        Raises:
            ValueError: If ``max_rows`` is smaller than 1, if ``frame`` is
                empty or has fewer than two columns, or if no row survives
                timestamp/value parsing.
        """
        # ``iloc[-0:]`` would select *every* row, so a zero or negative cap
        # must be rejected instead of silently disabling the limit.
        if max_rows < 1:
            raise ValueError("max_rows must be a positive integer")
        if frame.empty or frame.shape[1] < 2:
            raise ValueError("Hyndman source dataset must contain at least two columns")

        normalized = frame.iloc[:, :2].copy()
        normalized.columns = ["timestamp", "value"]
        # Unparseable cells become NaT/NaN here and are removed just below.
        normalized["timestamp"] = pd.to_datetime(normalized["timestamp"], errors="coerce")
        normalized["value"] = pd.to_numeric(normalized["value"], errors="coerce")
        normalized = normalized.dropna(subset=["timestamp", "value"]).sort_values("timestamp")

        # Fail loudly rather than hand back (and later write) a header-only file.
        if normalized.empty:
            raise ValueError(
                "Hyndman source dataset has no rows with a valid timestamp and numeric value"
            )

        normalized["timestamp"] = normalized["timestamp"].dt.strftime("%Y-%m-%d")

        # Rows are sorted ascending, so the tail holds the latest observations.
        if len(normalized) > max_rows:
            normalized = normalized.iloc[-max_rows:].copy()

        return normalized

    def download_and_prepare_one(self, file_name: str, source_url: str) -> Path:
        """Download one dataset from URL, normalize it, and write to output directory.

        Args:
            file_name: Name of the CSV file to create inside ``output_dir``.
            source_url: Location handed to ``pandas.read_csv``.

        Returns:
            Path of the written CSV file.

        Raises:
            ValueError: If the downloaded data cannot be normalized.
        """
        frame = pd.read_csv(source_url)
        normalized = self._normalize_two_column_time_series(frame, max_rows=self.MAX_ROWS)

        # Create the directory only after download and normalization succeeded.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        output_path = self.output_dir / file_name
        normalized.to_csv(output_path, index=False)
        return output_path

    def download_and_prepare_all(self) -> list[Path]:
        """Download, normalize, and write all configured Hyndman example datasets.

        Returns:
            Paths of the written files, in ``DATASET_URLS`` order.
        """
        return [
            self.download_and_prepare_one(file_name, source_url)
            for file_name, source_url in self.DATASET_URLS.items()
        ]