Source code for tseda.dataloader.uci_air_quality_data_loader

"""Data loader for UCI Air Quality hourly data."""

from __future__ import annotations

from io import BytesIO
from pathlib import Path
from urllib.request import urlopen
from zipfile import ZipFile

import pandas as pd

from .local_dataloader import LocalDataLoader


class UCIAirQualityDataLoader(LocalDataLoader):
    """Download, normalize, and expose UCI Air Quality data as a signal series.

    The loader fetches the zip archive at ``DATASET_URL``, reads
    ``CSV_NAME_IN_ZIP`` from it, reduces it to the hourly ``CO(GT)`` readings,
    and writes the result as a two-column CSV (``date``, ``signal``) to
    ``file_path``. Later calls reuse that CSV unless a refresh is requested.
    """

    DATASET_URL = "https://archive.ics.uci.edu/static/public/360/air+quality.zip"
    CSV_NAME_IN_ZIP = "AirQualityUCI.csv"

    # ``CO(GT)`` value that is treated as "no reading" and dropped.
    MISSING_SENTINEL = -200
    # Default cap on the number of (most recent) readings that are kept.
    MAX_ROWS = 2000

    def __init__(self, file_path: str = "data/uci_air_quality_hourly_co.csv") -> None:
        """Configure output location for the prepared hourly CO series.

        Args:
            file_path: Location of the prepared CSV. It is forwarded to the
                base loader, which is presumed to store it as
                ``self.file_path`` (the other methods read that attribute).
        """
        super().__init__(file_path)

    @staticmethod
    def _normalize_air_quality(
        frame: pd.DataFrame,
        *,
        max_rows: int | None = MAX_ROWS,
        missing_sentinel: float = MISSING_SENTINEL,
    ) -> pd.DataFrame:
        """Return a strict two-column hourly dataset in date/signal format.

        Args:
            frame: Raw table; must contain ``Date``, ``Time`` and ``CO(GT)``.
            max_rows: Keep only this many of the latest rows after cleaning.
                ``None`` disables the cap. Defaults to ``MAX_ROWS``.
            missing_sentinel: ``CO(GT)`` value treated as missing. Defaults to
                ``MISSING_SENTINEL``.

        Returns:
            DataFrame with columns ``date`` (``"%Y-%m-%d %H:%M:%S"`` strings)
            and ``signal`` (numeric), sorted by date, without missing values.

        Raises:
            ValueError: If a required column is absent or ``max_rows`` is
                negative.
        """
        required = ["Date", "Time", "CO(GT)"]
        missing = [c for c in required if c not in frame.columns]
        if missing:
            raise ValueError(f"UCI Air Quality is missing expected columns: {missing}")
        if max_rows is not None and max_rows < 0:
            raise ValueError(f"max_rows must be non-negative or None, got {max_rows}")

        # Rows whose date/time text does not match the expected layout become
        # NaT and are discarded by the dropna below.
        timestamps = pd.to_datetime(
            frame["Date"].astype(str) + " " + frame["Time"].astype(str),
            format="%d/%m/%Y %H.%M.%S",
            errors="coerce",
        )
        signal = pd.to_numeric(frame["CO(GT)"], errors="coerce")
        # mask() yields NaN for sentinel rows whatever the numeric dtype is,
        # instead of assigning pd.NA into the column in place.
        signal = signal.mask(signal == missing_sentinel)

        normalized = (
            pd.DataFrame({"date": timestamps, "signal": signal})
            .dropna(subset=["date", "signal"])
            .sort_values("date")
        )
        if max_rows is not None:
            # The frame is sorted ascending, so the tail holds the latest rows.
            normalized = normalized.tail(max_rows)

        normalized = normalized.copy()
        normalized["date"] = normalized["date"].dt.strftime("%Y-%m-%d %H:%M:%S")
        return normalized

    @staticmethod
    def _coerce_prepared(data: pd.DataFrame) -> pd.DataFrame:
        """Coerce a prepared table to datetime ``date`` and numeric ``signal``.

        The first two columns are taken positionally and renamed, so the
        column names of ``data`` do not matter.

        Raises:
            ValueError: If ``data`` has fewer than two columns.
        """
        if data.shape[1] < 2:
            raise ValueError(
                "Prepared air-quality data must have at least two columns "
                f"(date, signal); got {data.shape[1]}"
            )
        coerced = data.iloc[:, :2].copy()
        coerced.columns = ["date", "signal"]
        coerced["date"] = pd.to_datetime(coerced["date"], errors="coerce")
        coerced["signal"] = pd.to_numeric(coerced["signal"], errors="coerce")
        return coerced.dropna(subset=["date", "signal"]).sort_values(
            "date", ignore_index=True
        )

    def download_and_prepare(self) -> pd.DataFrame:
        """Download UCI Air Quality zip, prepare data, and write CSV to data directory.

        Returns:
            The prepared frame exactly as written to disk: ``date`` holds
            formatted strings and ``signal`` holds numeric values.

        Raises:
            ValueError: If the downloaded table lacks the expected columns.

        Network and archive errors raised by ``urlopen`` / ``ZipFile`` are not
        caught here and propagate to the caller.
        """
        with urlopen(self.DATASET_URL, timeout=60) as response:
            payload = response.read()

        with ZipFile(BytesIO(payload)) as archive, archive.open(
            self.CSV_NAME_IN_ZIP
        ) as csv_file:
            # The source file is semicolon-separated with decimal commas.
            frame = pd.read_csv(csv_file, sep=";", decimal=",")

        prepared = self._normalize_air_quality(frame)

        output_path = Path(self.file_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        prepared.to_csv(output_path, index=False)
        return prepared

    def load_air_quality(self, refresh: bool = False) -> pd.DataFrame:
        """Load prepared air-quality data; download first if missing or refresh requested.

        Both the freshly downloaded and the cached data pass through the same
        coercion, so the result has the same dtypes on either path.

        Args:
            refresh: Force a new download even when the prepared CSV exists.

        Returns:
            DataFrame with columns ``date`` (datetime) and ``signal``
            (numeric), sorted by date. An empty DataFrame with those columns
            is returned when the loaded table is empty.

        Raises:
            ValueError: If the loaded table has fewer than two columns.

        Errors raised while downloading are not swallowed; they propagate.
        """
        output_path = Path(self.file_path)
        if refresh or not output_path.exists():
            data = self.download_and_prepare()
        else:
            # load_data() comes from the base loader; it is presumed to return
            # the CSV at self.file_path as a DataFrame.
            data = self.load_data()

        if data.empty:
            return pd.DataFrame(columns=["date", "signal"])
        return self._coerce_prepared(data)

    def get_series(self, refresh: bool = False) -> pd.Series:
        """Return the air-quality CO signal as a pandas Series indexed by date.

        Args:
            refresh: Passed through to ``load_air_quality``.

        Returns:
            ``signal`` series indexed by ``date``. Returns an empty float
            series when no data is available.
        """
        data = self.load_air_quality(refresh=refresh)
        if data.empty:
            print("No data available to extract series.")
            return pd.Series(dtype=float)

        indexed = data.copy()
        # Kept defensive: a no-op when ``date`` is already datetime.
        indexed.index = pd.to_datetime(indexed["date"])
        return indexed["signal"]