Observational Data Staging Workflow

This notebook stages selected SEANOE Hvalfjörður datasets into the project data layout for a run-dated snapshot. It defines source links and output locations, downloads raw files (ZIP and XLSX), extracts archives when needed, copies tabular files into staged folders, and performs a quick preview to confirm expected outputs.

# Cell 1: Configuration
from pathlib import Path
from datetime import date

def _seanoe_entry(record_id: str, bucket: str, file_id: str, file_type: str) -> dict:
    """Build one dataset configuration entry from its identifying parts.

    Every entry configured below follows the same layout: the landing page
    is ``https://www.seanoe.org/data/<bucket>/<record_id>/`` and the
    downloadable file sits under ``data/<file_id>.<file_type>`` on that page.

    Args:
        record_id: Numeric record identifier (also the DOI suffix).
        bucket: Path segment that precedes the record id in the page URL.
        file_id: Identifier of the downloadable file within the record.
        file_type: File extension of the download ("zip" or "xlsx" here).

    Returns:
        Dict with the keys ``source_id``, ``doi``, ``dataset_page``, ``url``
        and ``file_type``, in that order.
    """
    page = f"https://www.seanoe.org/data/{bucket}/{record_id}/"
    return {
        "source_id": f"seanoe_{record_id}",
        "doi": f"10.17882/{record_id}",
        "dataset_page": page,
        "url": f"{page}data/{file_id}.{file_type}",
        "file_type": file_type,
    }


# Dataset key -> download configuration. Insertion order is the processing
# order of the later cells. The two mooring datasets share one record.
DATASETS = {
    "ctd_mooring_qc": _seanoe_entry("113246", "01021", "127701", "zip"),
    "adcp_mooring_qc": _seanoe_entry("113246", "01021", "127702", "zip"),
    "ctd_profiles_qc": _seanoe_entry("110439", "00993", "124334", "xlsx"),
    "discrete_samples_qc": _seanoe_entry("110401", "00992", "124314", "xlsx"),
}

# ISO date (YYYY-MM-DD) of the day this cell runs; it names the snapshot folders.
RUN_DATE = date.today().isoformat()

PROJECT_DATA_ROOT = Path("/anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data")
# Raw downloads and staged files live in parallel, run-dated trees.
RAW_RUN_DIR = PROJECT_DATA_ROOT.joinpath("raw", "seanoe_hvalfjordur", RUN_DATE)
STAGED_RUN_DIR = PROJECT_DATA_ROOT.joinpath("staged", "seanoe_hvalfjordur", RUN_DATE)
# The manifest path sits next to the run-dated staged folder, not inside it.
MANIFEST_CSV = STAGED_RUN_DIR.parent / f"manifest_{RUN_DATE}.csv"

# Create both run directories up front; re-running the cell is harmless.
for run_dir in (RAW_RUN_DIR, STAGED_RUN_DIR):
    run_dir.mkdir(parents=True, exist_ok=True)

print("RAW_RUN_DIR:", RAW_RUN_DIR)
print("STAGED_RUN_DIR:", STAGED_RUN_DIR)
print("Datasets configured:", list(DATASETS))

RAW_RUN_DIR: /anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/raw/seanoe_hvalfjordur/2026-04-27
STAGED_RUN_DIR: /anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/staged/seanoe_hvalfjordur/2026-04-27
Datasets configured: ['ctd_mooring_qc', 'adcp_mooring_qc', 'ctd_profiles_qc', 'discrete_samples_qc']

Step 1: Utility Functions

These helpers provide reusable functions for checksum generation, downloading files, extracting ZIP archives, and staging non-archive files.

# Cell 2: Helpers
import csv
import hashlib
import shutil
import urllib.request
import zipfile

def sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so the
    whole file is never held in memory at once.

    Args:
        path: File to hash.
        chunk_size: Number of bytes requested per read (default 1 MiB).

    Returns:
        The digest as a lowercase hex string.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

def download_file(url: str, dest: Path) -> Path:
    """Download *url* to *dest* unless *dest* already exists.

    The payload is streamed into a sibling ``<name>.part`` file and renamed
    onto *dest* only after the transfer finishes. Because an existing *dest*
    is treated as "already downloaded", writing directly to it would let an
    interrupted transfer leave a truncated file that every later run skips.

    Args:
        url: Location to fetch; opened with ``urllib.request.urlopen``.
        dest: Final path of the downloaded file. Parent directories are
            created if missing.

    Returns:
        *dest*, whether the file was downloaded now or was already present.

    Raises:
        Whatever ``urllib.request.urlopen`` or the file operations raise.
        In that case the partial ``.part`` file is removed and *dest* is
        not created.
    """
    if dest.exists():
        print(f"Already exists, skipping download: {dest.name}")
        return dest

    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    print(f"Downloading: {url}")
    try:
        with urllib.request.urlopen(url) as response, partial.open("wb") as out:
            shutil.copyfileobj(response, out)
        # Rename within the same directory so dest only ever appears complete.
        partial.replace(dest)
    except BaseException:
        # Also covers KeyboardInterrupt (e.g. interrupting the notebook cell).
        partial.unlink(missing_ok=True)
        raise
    print(f"Saved: {dest}")
    return dest

def unzip_file(zip_path: Path, extract_to: Path) -> list[Path]:
    """Extract every member of the ZIP archive into *extract_to*.

    Args:
        zip_path: Archive to read.
        extract_to: Destination directory; created (with parents) if missing.

    Returns:
        One path per archive member, built by joining *extract_to* with the
        member name as listed in the archive. Directory entries in the
        archive are included, so the list can be longer than the number of
        extracted files.
    """
    extract_to.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zip_path, mode="r") as archive:
        archive.extractall(path=extract_to)
        member_names = archive.namelist()
    return [extract_to.joinpath(member) for member in member_names]

def copy_file_to_stage(source_path: Path, staged_dir: Path) -> list[Path]:
    """Copy a single file into *staged_dir*, keeping its file name.

    Args:
        source_path: File to copy.
        staged_dir: Destination directory; created (with parents) if missing.

    Returns:
        A one-element list holding the path of the copy, so the result has
        the same shape as the list returned by ``unzip_file``.
    """
    staged_dir.mkdir(parents=True, exist_ok=True)
    target = staged_dir.joinpath(source_path.name)
    # copy2 copies the file contents and also tries to preserve its metadata.
    shutil.copy2(source_path, target)
    return [target]

Step 2: Download Raw Files

This step fetches each configured source file into a run-specific raw data directory, preserving dataset grouping.

# Cell 3: Download raw files
# Maps each dataset key to the local path of its raw file.
downloaded = {}

for dataset_key, spec in DATASETS.items():
    # Raw files are grouped by source record and named after the dataset key,
    # so datasets sharing a source_id do not overwrite each other.
    raw_name = f"{dataset_key}.{spec['file_type']}"
    raw_target = RAW_RUN_DIR.joinpath(spec["source_id"], raw_name)
    downloaded[dataset_key] = download_file(spec["url"], raw_target)

# Bare expression so the notebook displays the resulting mapping.
downloaded

Already exists, skipping download: ctd_mooring_qc.zip
Already exists, skipping download: adcp_mooring_qc.zip
Already exists, skipping download: ctd_profiles_qc.xlsx
Already exists, skipping download: discrete_samples_qc.xlsx

{'ctd_mooring_qc': PosixPath('/anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/raw/seanoe_hvalfjordur/2026-04-27/seanoe_113246/ctd_mooring_qc.zip'),
 'adcp_mooring_qc': PosixPath('/anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/raw/seanoe_hvalfjordur/2026-04-27/seanoe_113246/adcp_mooring_qc.zip'),
 'ctd_profiles_qc': PosixPath('/anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/raw/seanoe_hvalfjordur/2026-04-27/seanoe_110439/ctd_profiles_qc.xlsx'),
 'discrete_samples_qc': PosixPath('/anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/raw/seanoe_hvalfjordur/2026-04-27/seanoe_110401/discrete_samples_qc.xlsx')}

Step 3: Stage Downloaded Files

ZIP archives are extracted into staged folders, while non-archive files are copied directly into their dataset-specific staging locations.

# Cell 4: Stage downloaded files (extract ZIP, copy other formats)
# Maps each dataset key to the list of paths staged for it.
staged_index = {}

for dataset_key, raw_path in downloaded.items():
    spec = DATASETS[dataset_key]
    target_dir = STAGED_RUN_DIR.joinpath(spec["source_id"], dataset_key)

    # ZIP archives are unpacked; every other file type is copied unchanged.
    is_archive = spec["file_type"] == "zip"
    stage = unzip_file if is_archive else copy_file_to_stage
    staged_files = stage(raw_path, target_dir)
    action = "extracted" if is_archive else "copied"

    staged_index[dataset_key] = staged_files
    print(f"{dataset_key}: {action} {len(staged_files)} item(s) to {target_dir}")

ctd_mooring_qc: extracted 19 item(s) to /anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/staged/seanoe_hvalfjordur/2026-04-27/seanoe_113246/ctd_mooring_qc
adcp_mooring_qc: extracted 15 item(s) to /anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/staged/seanoe_hvalfjordur/2026-04-27/seanoe_113246/adcp_mooring_qc
ctd_profiles_qc: copied 1 item(s) to /anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/staged/seanoe_hvalfjordur/2026-04-27/seanoe_110439/ctd_profiles_qc
discrete_samples_qc: copied 1 item(s) to /anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/staged/seanoe_hvalfjordur/2026-04-27/seanoe_110401/discrete_samples_qc

Step 4: Quick Staging Preview

This quick check prints a small sample of staged paths to verify extraction results before moving on.

# Cell 6 (optional): quick preview of what was staged
from itertools import islice

# For each source directory under the staged run folder, print the first
# few paths found beneath it (files and directories alike), relative to
# STAGED_RUN_DIR.
for source_dir in sorted(STAGED_RUN_DIR.iterdir()):
    if not source_dir.is_dir():
        continue
    print(f"\n{source_dir.name} (showing up to 8 paths):")
    # islice stops the recursive walk after 8 entries instead of listing the
    # whole staged tree just to discard everything past the first 8.
    for p in islice(source_dir.rglob("*"), 8):
        print(" -", p.relative_to(STAGED_RUN_DIR))


seanoe_110401 (showing up to 8 paths):
 - seanoe_110401/discrete_samples_qc
 - seanoe_110401/discrete_samples_qc/discrete_samples_qc.xlsx

seanoe_110439 (showing up to 8 paths):
 - seanoe_110439/ctd_profiles_qc
 - seanoe_110439/ctd_profiles_qc/ctd_profiles_qc.xlsx

seanoe_113246 (showing up to 8 paths):
 - seanoe_113246/adcp_mooring_qc
 - seanoe_113246/ctd_mooring_qc
 - seanoe_113246/adcp_mooring_qc/Mooring_data_Final_.nc
 - seanoe_113246/adcp_mooring_qc/Mooring_data_Final_.nc/HVNV3_ADCP_33m_20250217_20250428.nc
 - seanoe_113246/adcp_mooring_qc/Mooring_data_Final_.nc/HVSA2_ADCP_30m_20240814_20250217.nc
 - seanoe_113246/adcp_mooring_qc/Mooring_data_Final_.nc/HVIN2_ADCP_59m_20240814_20250217.nc
 - seanoe_113246/adcp_mooring_qc/Mooring_data_Final_.nc/HVNV2_ADCP_33m_20240815_20250116.nc
 - seanoe_113246/adcp_mooring_qc/Mooring_data_Final_.nc/HVNA3_ADCP_26m_20250217_20250428.nc