Source code for tseda.dataloader.hyndman_examples_data_loader

"""Downloader for Hyndman-based example datasets used by tseda."""

from __future__ import annotations

from pathlib import Path

import pandas as pd


class HyndmanExamplesDataLoader:
    """Download and normalize Hyndman example files into the local ``data`` directory.

    Each entry of ``DATASET_URLS`` maps the file name written locally to the
    URL the raw CSV is read from. Every dataset is reduced to two columns,
    ``timestamp`` and ``value``, before it is written to ``output_dir``.
    """

    DATASET_URLS = {
        "hyndman_goog_daily_close.csv": "https://raw.githubusercontent.com/rajivsam/tseda/main/data/hyndman_goog_daily_close.csv",
        "hyndman_hyndsight_daily_pageviews.csv": "https://raw.githubusercontent.com/rajivsam/tseda/main/data/hyndman_hyndsight_daily_pageviews.csv",
        "hyndman_arrivals_quarterly_japan.csv": "https://raw.githubusercontent.com/rajivsam/tseda/main/data/hyndman_arrivals_quarterly_japan.csv",
        "hyndman_usconsumption_quarterly_consumption.csv": "https://raw.githubusercontent.com/rajivsam/tseda/main/data/hyndman_usconsumption_quarterly_consumption.csv",
        "hyndman_sunspot_monthly_area.csv": "https://raw.githubusercontent.com/rajivsam/tseda/main/data/hyndman_sunspot_monthly_area.csv",
    }

    # Default number of most recent observations kept per normalized dataset.
    DEFAULT_MAX_ROWS = 2000

    def __init__(self, output_dir: str | Path = "data") -> None:
        """Set destination directory for downloaded example files.

        Args:
            output_dir: Directory the normalized CSV files are written to.
                It is created on demand by ``download_and_prepare_one``.
        """
        self.output_dir = Path(output_dir)

    @staticmethod
    def _normalize_two_column_time_series(
        frame: pd.DataFrame,
        max_rows: int | None = DEFAULT_MAX_ROWS,
    ) -> pd.DataFrame:
        """Return a strict two-column format accepted by tseda uploads.

        The first two columns of ``frame`` are renamed to ``timestamp`` and
        ``value``; any further columns are discarded. Rows whose timestamp
        cannot be parsed or whose value is not numeric are dropped, the
        remainder is sorted by timestamp, and timestamps are rendered as
        ``YYYY-MM-DD`` strings.

        Args:
            frame: Raw dataset with at least two columns.
            max_rows: Keep only this many of the latest rows. ``None``
                disables the cap.

        Returns:
            A new frame with the columns ``timestamp`` and ``value``.

        Raises:
            ValueError: If ``frame`` is empty or has fewer than two columns,
                if ``max_rows`` is not positive, or if no row survives the
                timestamp/value coercion.
        """
        if frame.empty or frame.shape[1] < 2:
            raise ValueError("Hyndman source dataset must contain at least two columns")
        # ``iloc[-0:]`` would keep every row, so a non-positive cap is
        # rejected instead of being silently ignored.
        if max_rows is not None and max_rows < 1:
            raise ValueError("max_rows must be a positive integer or None")

        normalized = frame.iloc[:, :2].copy()
        normalized.columns = ["timestamp", "value"]
        # Unparseable cells become NaT/NaN here and are removed just below.
        normalized["timestamp"] = pd.to_datetime(normalized["timestamp"], errors="coerce")
        normalized["value"] = pd.to_numeric(normalized["value"], errors="coerce")
        normalized = normalized.dropna(subset=["timestamp", "value"]).sort_values("timestamp")

        # An empty input is rejected above; a dataset that is empty after
        # cleaning must fail as well instead of producing a header-only file.
        if normalized.empty:
            raise ValueError(
                "Hyndman source dataset has no rows with a parseable timestamp and numeric value"
            )

        normalized["timestamp"] = normalized["timestamp"].dt.strftime("%Y-%m-%d")
        if max_rows is not None and len(normalized) > max_rows:
            # Rows are sorted ascending, so the tail holds the latest ones.
            normalized = normalized.iloc[-max_rows:].copy()
        return normalized

    def download_and_prepare_one(self, file_name: str, source_url: str) -> Path:
        """Download one dataset from URL, normalize it, and write to output directory.

        Args:
            file_name: Name of the CSV file created inside ``output_dir``.
            source_url: Location of the raw CSV, passed to ``pandas.read_csv``.

        Returns:
            Path of the written CSV file.

        Raises:
            ValueError: If the dataset cannot be normalized.
        """
        frame = pd.read_csv(source_url)
        normalized = self._normalize_two_column_time_series(frame)
        # Create the directory only once there is something valid to write.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        output_path = self.output_dir / file_name
        normalized.to_csv(output_path, index=False)
        return output_path

    def download_and_prepare_all(self) -> list[Path]:
        """Download, normalize, and write all configured Hyndman example datasets.

        Returns:
            Paths of the written files, in ``DATASET_URLS`` order.
        """
        return [
            self.download_and_prepare_one(file_name, source_url)
            for file_name, source_url in self.DATASET_URLS.items()
        ]