Observational Data Staging Workflow
This notebook stages selected SEANOE Hvalfjörður datasets into the project data layout for a run-dated snapshot. It defines source links and output locations, downloads raw files (ZIP and XLSX), extracts archives when needed, copies tabular files into staged folders, and performs a quick preview to confirm expected outputs.
# Cell 1: Configuration
from pathlib import Path
from datetime import date
# One entry per downloadable file. "source_id" names the SEANOE record the file
# belongs to (both mooring archives share seanoe_113246); "file_type" is the
# extension of the downloaded file ("zip" or "xlsx").
DATASETS = {
    "ctd_mooring_qc": {
        "source_id": "seanoe_113246",
        "doi": "10.17882/113246",
        "dataset_page": "https://www.seanoe.org/data/01021/113246/",
        "url": "https://www.seanoe.org/data/01021/113246/data/127701.zip",
        "file_type": "zip",
    },
    "adcp_mooring_qc": {
        "source_id": "seanoe_113246",
        "doi": "10.17882/113246",
        "dataset_page": "https://www.seanoe.org/data/01021/113246/",
        "url": "https://www.seanoe.org/data/01021/113246/data/127702.zip",
        "file_type": "zip",
    },
    "ctd_profiles_qc": {
        "source_id": "seanoe_110439",
        "doi": "10.17882/110439",
        "dataset_page": "https://www.seanoe.org/data/00993/110439/",
        "url": "https://www.seanoe.org/data/00993/110439/data/124334.xlsx",
        "file_type": "xlsx",
    },
    "discrete_samples_qc": {
        "source_id": "seanoe_110401",
        "doi": "10.17882/110401",
        "dataset_page": "https://www.seanoe.org/data/00992/110401/",
        "url": "https://www.seanoe.org/data/00992/110401/data/124314.xlsx",
        "file_type": "xlsx",
    },
}

# ISO date (YYYY-MM-DD) of the day this cell runs; it names the snapshot folders.
RUN_DATE = date.today().isoformat()

PROJECT_DATA_ROOT = Path("/anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data")
RAW_RUN_DIR = PROJECT_DATA_ROOT.joinpath("raw", "seanoe_hvalfjordur", RUN_DATE)
STAGED_RUN_DIR = PROJECT_DATA_ROOT.joinpath("staged", "seanoe_hvalfjordur", RUN_DATE)
# The manifest sits beside the dated staged folder, not inside it.
MANIFEST_CSV = STAGED_RUN_DIR.parent / f"manifest_{RUN_DATE}.csv"

# Create both snapshot folders up front (no error if they already exist).
for run_dir in (RAW_RUN_DIR, STAGED_RUN_DIR):
    run_dir.mkdir(parents=True, exist_ok=True)

print("RAW_RUN_DIR:", RAW_RUN_DIR)
print("STAGED_RUN_DIR:", STAGED_RUN_DIR)
print("Datasets configured:", list(DATASETS))

# Recorded cell output:
# RAW_RUN_DIR: /anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/raw/seanoe_hvalfjordur/2026-04-27
# STAGED_RUN_DIR: /anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/staged/seanoe_hvalfjordur/2026-04-27
# Datasets configured: ['ctd_mooring_qc', 'adcp_mooring_qc', 'ctd_profiles_qc', 'discrete_samples_qc']
Step 1: Utility Functions
These helpers provide reusable functions for checksum generation, downloading files, extracting ZIP archives, and staging non-archive files.
# Cell 2: Helpers
import csv
import hashlib
import shutil
import urllib.request
import zipfile
def sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and fed to the hash in pieces of at
    most ``chunk_size`` bytes, so it is never held in memory all at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # An empty read marks end of file and ends the loop.
        while block := stream.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()
def download_file(url: str, dest: Path) -> Path:
    """Download *url* to *dest* unless *dest* already exists.

    Args:
        url: Address to fetch; passed unchanged to ``urllib.request.urlopen``.
        dest: Final location of the downloaded file. Missing parent
            directories are created.

    Returns:
        ``dest``, whether the file was just downloaded or was already present.

    Raises:
        OSError: If the URL cannot be opened or the file cannot be written
            (``urllib.error.URLError`` is a subclass). No file is left at
            ``dest`` in that case.
    """
    if dest.exists():
        print(f"Already exists, skipping download: {dest.name}")
        return dest

    dest.parent.mkdir(parents=True, exist_ok=True)
    # Stream into a sibling ".part" file and rename only after the transfer
    # finishes. Writing straight to dest would leave a truncated file behind
    # on an interrupted download, and the exists() check above would then
    # skip it on every later run.
    partial = dest.with_name(dest.name + ".part")
    print(f"Downloading: {url}")
    try:
        with urllib.request.urlopen(url) as response, partial.open("wb") as out:
            shutil.copyfileobj(response, out)
        partial.replace(dest)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)
    print(f"Saved: {dest}")
    return dest
def unzip_file(zip_path: Path, extract_to: Path) -> list[Path]:
    """Extract every member of a ZIP archive and report where each one landed.

    Args:
        zip_path: Archive to read.
        extract_to: Directory to extract into; created (with parents) if it
            does not exist.

    Returns:
        One path per archive member, in archive order (directory entries
        included), giving the location the member was actually written to.
    """
    extract_to.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zf:
        # zipfile sanitises member names while extracting (leading slashes
        # and ".." components are dropped), so joining extract_to with the
        # raw name can point at a path that was never written. extract()
        # returns the real, normalised target, so report that instead.
        return [Path(zf.extract(name, extract_to)) for name in zf.namelist()]
def copy_file_to_stage(source_path: Path, staged_dir: Path) -> list[Path]:
    """Copy one file into *staged_dir* under its original file name.

    The staging directory is created (with parents) when missing. The copy is
    made with ``shutil.copy2``. The result is a single-element list holding
    the path of the staged copy.
    """
    target = staged_dir.joinpath(source_path.name)
    staged_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy2(source_path, target)
    return [target]


# Step 2: Download Raw Files
This step fetches each configured source file into a run-specific raw data directory, preserving dataset grouping.
# Cell 3: Download raw files
# Map each dataset key to its raw file, stored as
# <RAW_RUN_DIR>/<source_id>/<key>.<file_type>; download_file skips files that
# are already present.
downloaded = {
    key: download_file(
        cfg["url"],
        RAW_RUN_DIR / cfg["source_id"] / f"{key}.{cfg['file_type']}",
    )
    for key, cfg in DATASETS.items()
}
downloaded

# Recorded cell output:
# Already exists, skipping download: ctd_mooring_qc.zip
# Already exists, skipping download: adcp_mooring_qc.zip
# Already exists, skipping download: ctd_profiles_qc.xlsx
# Already exists, skipping download: discrete_samples_qc.xlsx
# {'ctd_mooring_qc': PosixPath('/anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/raw/seanoe_hvalfjordur/2026-04-27/seanoe_113246/ctd_mooring_qc.zip'),
# 'adcp_mooring_qc': PosixPath('/anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/raw/seanoe_hvalfjordur/2026-04-27/seanoe_113246/adcp_mooring_qc.zip'),
# 'ctd_profiles_qc': PosixPath('/anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/raw/seanoe_hvalfjordur/2026-04-27/seanoe_110439/ctd_profiles_qc.xlsx'),
# 'discrete_samples_qc': PosixPath('/anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/raw/seanoe_hvalfjordur/2026-04-27/seanoe_110401/discrete_samples_qc.xlsx')}

# Step 3: Stage Downloaded Files
ZIP archives are extracted into staged folders, while non-archive files are copied directly into their dataset-specific staging locations.
# Cell 4: Stage downloaded files (extract ZIP, copy other formats)
# dataset key -> list of paths placed under the staged folder for that dataset
staged_index = {}
for key, local_path in downloaded.items():
    cfg = DATASETS[key]
    out_dir = STAGED_RUN_DIR.joinpath(cfg["source_id"], key)
    if cfg["file_type"] != "zip":
        # Non-archive files are staged as a plain copy.
        staged_files = copy_file_to_stage(local_path, out_dir)
        action = "copied"
    else:
        # Archives are unpacked directly into the staged folder.
        staged_files = unzip_file(local_path, out_dir)
        action = "extracted"
    staged_index[key] = staged_files
    print(f"{key}: {action} {len(staged_files)} item(s) to {out_dir}")

# Recorded cell output:
# ctd_mooring_qc: extracted 19 item(s) to /anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/staged/seanoe_hvalfjordur/2026-04-27/seanoe_113246/ctd_mooring_qc
# adcp_mooring_qc: extracted 15 item(s) to /anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/staged/seanoe_hvalfjordur/2026-04-27/seanoe_113246/adcp_mooring_qc
# ctd_profiles_qc: copied 1 item(s) to /anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/staged/seanoe_hvalfjordur/2026-04-27/seanoe_110439/ctd_profiles_qc
# discrete_samples_qc: copied 1 item(s) to /anvil/projects/x-ees250129/x-uheede/C-Star-in-Hvalfjordur/data/staged/seanoe_hvalfjordur/2026-04-27/seanoe_110401/discrete_samples_qc
Step 4: Quick Staging Preview
This quick check prints a small sample of staged paths to verify extraction results before moving on.
# Cell 6 (optional): quick preview of what was staged
# Print up to eight paths found under each top-level folder of the staged
# snapshot, shown relative to the snapshot root.
for source_dir in sorted(STAGED_RUN_DIR.iterdir()):
    if not source_dir.is_dir():
        continue  # only folders are previewed
    sample = list(source_dir.rglob("*"))[:8]
    print(f"\n{source_dir.name} (showing up to 8 paths):")
    for p in sample:
        print(" -", p.relative_to(STAGED_RUN_DIR))
seanoe_110401 (showing up to 8 paths):
- seanoe_110401/discrete_samples_qc
- seanoe_110401/discrete_samples_qc/discrete_samples_qc.xlsx
seanoe_110439 (showing up to 8 paths):
- seanoe_110439/ctd_profiles_qc
- seanoe_110439/ctd_profiles_qc/ctd_profiles_qc.xlsx
seanoe_113246 (showing up to 8 paths):
- seanoe_113246/adcp_mooring_qc
- seanoe_113246/ctd_mooring_qc
- seanoe_113246/adcp_mooring_qc/Mooring_data_Final_.nc
- seanoe_113246/adcp_mooring_qc/Mooring_data_Final_.nc/HVNV3_ADCP_33m_20250217_20250428.nc
- seanoe_113246/adcp_mooring_qc/Mooring_data_Final_.nc/HVSA2_ADCP_30m_20240814_20250217.nc
- seanoe_113246/adcp_mooring_qc/Mooring_data_Final_.nc/HVIN2_ADCP_59m_20240814_20250217.nc
- seanoe_113246/adcp_mooring_qc/Mooring_data_Final_.nc/HVNV2_ADCP_33m_20240815_20250116.nc
- seanoe_113246/adcp_mooring_qc/Mooring_data_Final_.nc/HVNA3_ADCP_26m_20250217_20250428.nc