Query partitioned lakehouse
Use DuckDB with hive_partitioning=true and union_by_name=true so mixed-schema Parquet batches remain queryable.
"""Example showing safe analytical reads from a partitioned Hive Parquet lakehouse."""
from __future__ import annotations
import os
import duckdb
from imednet.integrations.parquet import hive_parquet_query
base_dir = os.getenv("IMEDNET_LAKEHOUSE_DIR", "/tmp/lakehouse")
query = (
"SELECT study_key, form_key, COUNT(*) AS row_count "
f"FROM ({hive_parquet_query(base_dir)}) records "
"GROUP BY study_key, form_key "
"ORDER BY study_key, form_key"
)
rows = duckdb.connect(":memory:").execute(query).fetchall()
for row in rows:
print(row)