Hive Metastore to Unity Catalog: The Complete Migration Guide
At Revantage Asia, I led the migration of 5TB+ of data assets from legacy Hive Metastore to Databricks Unity Catalog (UC). This is the complete technical playbook.
Why Unity Catalog?
Unity Catalog (UC) gives you what Hive Metastore never could:
| Feature | Hive Metastore | Unity Catalog |
|---------|----------------|---------------|
| Governance | Per-workspace | Cross-workspace |
| Column-level security | No | Yes |
| Data lineage | No | Built-in |
| Row-level filters | Manual | Native |
| Audit logs | Basic | Fine-grained |
| Sharing | Complex | Delta Sharing |
Pre-Migration Assessment
Before touching anything, run this assessment script:
# assessment.py
from pyspark.sql import SparkSession
from databricks.sdk import WorkspaceClient
spark = SparkSession.builder.getOrCreate()
w = WorkspaceClient()
def assess_hive_metastore():
    """Build an inventory of every HMS table for migration planning.

    Returns:
        A list of dicts, one per table, recording database, name, type,
        provider/format, storage location, and two derived flags
        (``is_delta``, ``is_managed``) used to plan conversion order.
    """
    inventory = []
    for db_row in spark.sql("SHOW DATABASES").collect():
        db_name = db_row.databaseName
        for tbl_row in spark.sql(f"SHOW TABLES IN {db_name}").collect():
            table_name = tbl_row.tableName
            try:
                rows = spark.sql(
                    f"DESCRIBE EXTENDED {db_name}.{table_name}"
                ).collect()
            except Exception as e:
                # Best-effort inventory: one broken table must not
                # abort the whole assessment.
                print(f"Error assessing {db_name}.{table_name}: {e}")
                continue
            meta = {r.col_name: r.data_type for r in rows}
            location = meta.get("Location", "")
            inventory.append({
                "database": db_name,
                "table": table_name,
                "type": meta.get("Type", "unknown"),
                "format": meta.get("Provider", "unknown"),
                "location": location,
                "is_delta": meta.get("Provider", "").lower() == "delta",
                # NOTE(review): managed-table heuristic assumes the default
                # HMS warehouse path — confirm for this workspace.
                "is_managed": "dbfs:/user/hive" in location,
            })
    return inventory
# Run the assessment and summarize how much conversion work remains.
inventory = assess_hive_metastore()
delta_total = sum(1 for t in inventory if t["is_delta"])
print(f"Total tables: {len(inventory)}")
print(f"Delta tables: {delta_total}")
print(f"Non-Delta (need conversion): {len(inventory) - delta_total}")
The Migration Strategy: Phased Waves
Wave 1: Non-Production Schemas
Wave 2: Read-heavy Production Tables
Wave 3: Write-critical Production Tables (with brief maintenance window)
Step 1: Upgrade Non-Delta Tables to Delta
UC works best with Delta Lake, so convert non-Delta tables first. Note that Parquet tables can be converted in place with CONVERT TO DELTA; CSV/JSON tables need a CREATE TABLE AS SELECT rewrite instead:
# convert_to_delta.py
def upgrade_table_to_delta(database: str, table: str, dry_run: bool = True):
    """Convert a non-Delta HMS table to Delta format, in place.

    The table's Parquet files gain a ``_delta_log`` via CONVERT TO DELTA,
    a new Delta table is registered over the same location, and names are
    swapped so consumers keep using the original table name (the original
    metadata survives as ``{table}_old`` for rollback).

    Args:
        database: HMS database containing the table.
        table: Table name to convert.
        dry_run: When True (default), only report what would happen.

    Raises:
        ValueError: On a real run, if the table's provider is neither
            Delta nor Parquet — CONVERT TO DELTA supports Parquet only;
            CSV/JSON tables need a CTAS rewrite instead.
    """
    # Check current format
    desc = spark.sql(f"DESCRIBE EXTENDED {database}.{table}").collect()
    props = {r.col_name: r.data_type for r in desc}
    provider = props.get("Provider", "").lower()

    if provider == "delta":
        print(f" ✓ {database}.{table} already Delta, skipping")
        return

    location = props.get("Location", "")
    if not dry_run:
        # Bug fix: the original ran CONVERT TO DELTA for EVERY non-Delta
        # provider, but the command only understands Parquet layouts.
        # Fail fast with an actionable message for CSV/JSON/etc. rather
        # than corrupting the table's storage location.
        if provider != "parquet":
            raise ValueError(
                f"{database}.{table} uses provider '{provider}'; "
                "CONVERT TO DELTA supports Parquet only — rewrite this "
                "table with CREATE TABLE AS SELECT instead"
            )
        # Convert in-place for Parquet (adds the _delta_log)
        spark.sql(f"""
            CONVERT TO DELTA parquet.`{location}`
        """)
        # Recreate table pointing to Delta location
        spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {database}.{table}_delta
            USING DELTA
            LOCATION '{location}'
        """)
        # Rename with atomic operation so the public name never disappears
        spark.sql(f"ALTER TABLE {database}.{table} RENAME TO {database}.{table}_old")
        spark.sql(f"ALTER TABLE {database}.{table}_delta RENAME TO {database}.{table}")
        print(f" ✓ Converted {database}.{table} to Delta")
    else:
        print(f" [DRY RUN] Would convert {database}.{table} (format: {props.get('Provider')})")
Step 2: Catalog/Schema Structure in UC
Map your HMS databases to UC three-level namespace:
# create_uc_structure.py
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.catalog import SecurableType
w = WorkspaceClient()
# Create catalog per business domain
def create_catalog(catalog_name: str, comment: str):
    """Create a UC catalog, tolerating re-runs (already-exists is a no-op)."""
    try:
        w.catalogs.create(
            name=catalog_name,
            comment=comment,
            properties={"managed_by": "terraform", "env": "production"}
        )
        print(f"✓ Created catalog: {catalog_name}")
    except Exception as e:
        # Migration scripts are re-run; an existing catalog is expected,
        # anything else is a real failure and must propagate.
        if "already exists" not in str(e):
            raise
        print(f" Catalog {catalog_name} already exists")
# Create schemas (map from HMS databases)
def create_schema(catalog: str, schema: str, comment: str = ""):
    """Create a UC schema inside *catalog* (maps 1:1 from an HMS database)."""
    w.schemas.create(catalog_name=catalog, name=schema, comment=comment)
# Example structure: one catalog per business domain, schemas map from HMS DBs.
for catalog_name, catalog_desc in [
    ("prod_finance", "Finance domain - production data"),
    ("prod_operations", "Operations domain - production data"),
]:
    create_catalog(catalog_name, catalog_desc)
create_schema("prod_finance", "accounting", "GL and AP/AR data")
create_schema("prod_finance", "reporting", "Finance reporting layer")
Step 3: Migrate Tables with ACL Preservation
This is the critical part — preserving permissions:
# migrate_tables.py
import json
from databricks.sdk.service.catalog import PermissionsChange, SecurableType
def get_hms_permissions(database: str, table: str) -> dict:
    """Capture a table's current HMS grants via SHOW GRANTS.

    Returns:
        ``{"principals": [{"principal": ..., "privilege": ...}, ...]}``
        with one entry per grant row.
    """
    rows = spark.sql(f"SHOW GRANTS ON TABLE {database}.{table}").collect()
    principals = []
    for row in rows:
        principals.append({"principal": row.Principal, "privilege": row.ActionType})
    return {"principals": principals}
def migrate_table_to_uc(
    hms_db: str,
    hms_table: str,
    uc_catalog: str,
    uc_schema: str,
    uc_table: "str | None" = None
):
    """Register an HMS table in Unity Catalog and re-apply its grants.

    Creates an EXTERNAL UC table over the table's existing storage
    location (no data is copied), then replays each SHOW GRANTS entry
    as a UC GRANT.

    Args:
        hms_db: Source HMS database.
        hms_table: Source HMS table name.
        uc_catalog: Target UC catalog.
        uc_schema: Target UC schema.
        uc_table: Target table name; defaults to ``hms_table``.

    Returns:
        The fully-qualified three-level UC table name.

    Raises:
        ValueError: If the HMS table has no storage Location and so
            cannot be registered as an external table.
    """
    uc_table = uc_table or hms_table
    full_uc_name = f"{uc_catalog}.{uc_schema}.{uc_table}"
    print(f"Migrating {hms_db}.{hms_table} → {full_uc_name}")

    # 1. Get HMS location
    desc = spark.sql(f"DESCRIBE EXTENDED {hms_db}.{hms_table}").collect()
    props = {r.col_name: r.data_type for r in desc}
    # Bug fix: props["Location"] raised a bare KeyError when DESCRIBE
    # output had no Location row; fail with an actionable message instead.
    location = props.get("Location")
    if not location:
        raise ValueError(
            f"{hms_db}.{hms_table} has no storage Location; "
            "cannot create an external UC table for it"
        )

    # 2. Capture permissions before migration so nothing is lost later
    perms = get_hms_permissions(hms_db, hms_table)

    # 3. Create table in UC (external table pointing to existing location)
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS {full_uc_name}
        USING DELTA
        LOCATION '{location}'
    """)

    # 4. Re-apply permissions in UC. Not every HMS ActionType is a valid
    # UC privilege (and principals are case-sensitive in UC), so an
    # unmappable grant warns instead of aborting the whole migration.
    for grant in perms["principals"]:
        try:
            spark.sql(f"""
                GRANT {grant['privilege']} ON TABLE {full_uc_name}
                TO `{grant['principal']}`
            """)
        except Exception as e:
            print(f" Warning: Could not apply grant {grant}: {e}")

    # 5. Upgrade to UC managed table (optional - moves to UC managed
    # storage). Left disabled: external tables keep rollback trivial.
    # spark.sql(f"ALTER TABLE {full_uc_name} SET TBLPROPERTIES ('upgraded_to' = '{full_uc_name}')")

    print(f" ✓ Migrated with {len(perms['principals'])} permission grants")
    return full_uc_name
Step 4: Azure DevOps Pipeline for Automated Migration
# azure-pipelines-uc-migration.yml
# Three-wave migration: assess first, then non-prod, then prod behind an
# environment gate. Nesting restored — the flattened original was not
# valid YAML.
stages:
  - stage: Assessment
    jobs:
      - job: RunAssessment
        steps:
          - task: DatabricksNotebook@0
            inputs:
              notebookPath: '/Shared/migration/assessment'
              existingClusterId: $(CLUSTER_ID)
              executionResultsVariable: 'assessmentResults'

  - stage: NonProd
    dependsOn: Assessment
    jobs:
      - job: MigrateNonProd
        steps:
          - task: DatabricksNotebook@0
            inputs:
              notebookPath: '/Shared/migration/migrate_tables'
              notebookParameters: |
                {
                  "env": "nonprod",
                  "dry_run": "false",
                  "catalogs": "$(NON_PROD_CATALOGS)"
                }

  - stage: Production
    dependsOn: NonProd
    jobs:
      # Deployment job so the run is gated on the 'databricks-production'
      # environment (approvals/checks configured there).
      - deployment: MigrateProd
        environment: 'databricks-production'
        strategy:
          runOnce:
            deploy:
              steps:
                - task: DatabricksNotebook@0
                  inputs:
                    notebookPath: '/Shared/migration/migrate_tables'
                    notebookParameters: '{"env": "prod", "dry_run": "false"}'
Step 5: Validation
Never declare success without validation:
# validate_migration.py
def validate_migration(hms_table: str, uc_table: str) -> bool:
    """Validate that a migrated UC table matches its HMS source.

    Checks the row count and the column (name, type) pairs of both
    tables.

    Args:
        hms_table: Fully-qualified source table (``db.table``).
        uc_table: Fully-qualified UC table (``catalog.schema.table``).

    Returns:
        True only when both the row count and the schema match.
    """
    # Row count check
    hms_count = spark.sql(f"SELECT COUNT(*) FROM {hms_table}").collect()[0][0]
    uc_count = spark.sql(f"SELECT COUNT(*) FROM {uc_table}").collect()[0][0]
    count_match = hms_count == uc_count

    # Schema check — bug fix: the original compared only the NUMBER of
    # DESCRIBE rows, which passes when columns are renamed or retyped
    # (and DESCRIBE output can include partition/section rows). Compare
    # (name, type) pairs instead, skipping blank and '#'-prefixed rows.
    def _columns(table: str) -> list:
        rows = spark.sql(f"DESCRIBE {table}").collect()
        return [
            (r.col_name, r.data_type)
            for r in rows
            if r.col_name and not r.col_name.startswith("#")
        ]

    schema_match = _columns(hms_table) == _columns(uc_table)

    print(f" Rows: HMS={hms_count:,} UC={uc_count:,} {'✓' if count_match else '✗'}")
    print(f" Schema columns: {'✓' if schema_match else '✗'}")
    return count_match and schema_match
Key Lessons from 5TB Migration
- Never migrate and upgrade storage simultaneously — do Delta conversion first, UC migration second
- External tables are your safety net — data stays in ADLS, easy rollback
- Test ACL migration on a sandbox — UC principals are case-sensitive, HMS often isn't
- Keep HMS running for 30 days post-migration — rollback window for edge cases
- Lineage breaks on migration — document this for stakeholders upfront
Scripts available at github.com/suhail39ahmed/databricks-snowflake-devops