diff --git a/skills/ontology-stack-builder/LICENSE b/skills/ontology-stack-builder/LICENSE new file mode 100644 index 00000000..0ccb377f --- /dev/null +++ b/skills/ontology-stack-builder/LICENSE @@ -0,0 +1,184 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship made available under + the License, as indicated by a copyright notice that is included in + or attached to the work (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other transformations + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean, as submitted to the Licensor for inclusion + in the Work by the copyright owner or by an individual or Legal Entity + authorized to submit on behalf of the copyright owner. For the purposes + of this definition, "submitted" means any form of electronic, verbal, + or written communication sent to the Licensor or its representatives, + including but not limited to communication on electronic mailing lists, + source code control systems, and issue tracking systems that are managed + by, or on behalf of, the Licensor for the purpose of discussing and + improving the Work, but excluding communication that is conspicuously + marked or otherwise designated in writing by the copyright owner as + "Not a Contribution." + + "Contributor" shall mean Licensor and any Legal Entity on behalf of + whom a Contribution has been received by the Licensor and subsequently + incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a cross-claim + or counterclaim in a lawsuit) alleging that the Work or any + Contribution embodied within the Work constitutes direct or contributory + patent infringement, then any patent licenses granted to You under + this License for that Work shall terminate as of the date such + litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative + Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, You must include a readable copy of the + attribution notices contained within such NOTICE file, in + at least one of the following places: within a NOTICE text + file distributed as part of the Derivative Works; within + the Source form or 
documentation, if provided along with the + Derivative Works; or, within a display generated by the + Derivative Works, if and wherever such third-party notices + normally appear. The contents of the NOTICE file are for + informational purposes only and do not modify the License. + You may add Your own attribution notices within Derivative + Works that You distribute, alongside or as an addendum to + the NOTICE text from the Work, provided that such additional + attribution notices cannot be construed as modifying the License. + + You may add Your own license statement for Your modifications and + may provide additional grant of rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the + Contribution, either on an unmodified basis, with modifications, + or as part of a larger work. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. 
You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or all other + commercial damages or losses), even if such Contributor has been + advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2025 Snowflake Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/skills/ontology-stack-builder/README.md b/skills/ontology-stack-builder/README.md new file mode 100644 index 00000000..0846b112 --- /dev/null +++ b/skills/ontology-stack-builder/README.md @@ -0,0 +1,212 @@ +

+ ![Ontology-on-Snowflake — 8 Classes, 8 Relations, Fully Mapped](assets/ontology-graph.png)

+ +# ontology-stack-builder + +A [Cortex Code](https://docs.snowflake.com/en/user-guide/cortex-code/cortex-code) skill that generates the full **Ontology-on-Snowflake** stack from any relational schema through a 7-phase gated workflow. + +You bring your Snowflake tables and business questions. The skill builds the rest — metadata, abstract views, semantic models, and a Cortex Agent — in a single conversational session. + +For a walkthrough of how this skill works and how to use it, see [Ontology on Snowflake: From Architecture to Deployment with a Cortex Code Skill](https://medium.com/snowflake/ontology-on-snowflake-from-architecture-to-deployment-with-a-cortex-code-skill-197866ce9c9f). + +For the architectural foundations, see the original blog series: +- [Part 1 — Overview and Data Model](https://medium.com/snowflake/ontology-on-snowflake-part-1-overview-and-data-model-9e8eeaac7363) +- [Part 2 — Semantic Models](https://medium.com/snowflake/ontology-on-snowflake-part-2-semantic-models-9aa0fa9b9312) +- [Part 3 — AI-Powered Intelligence](https://medium.com/snowflake/ontology-on-snowflake-part-3-ai-powered-intelligence-bbace87c6be1) + +--- + +## What It Builds + +| Layer | What Gets Created | +|-------|-------------------| +| **L1 — Physical Storage** | Source table views or KG_NODE/KG_EDGE graph tables | +| **L2 — Ontology Metadata** | ~22 metadata tables with auto-populated seed data | +| **L3 — Abstract Views** | Per-class abstract views, hierarchy views, view-generator procedure | +| **L4 — Semantic Views** | Base semantic view (reused from existing or created over source tables) + ontology-layer semantic views | +| **L5 — Cortex Agent** | Agent with intent-routed tools: base + ontology + optional graph UDFs | + +--- + +## Installation + +```bash +# Clone from Snowflake-Labs +git clone https://github.com/Snowflake-Labs/cortex-code-skills.git +cp -r cortex-code-skills/skills/ontology-stack-builder ~/.snowflake/cortex/skills/ontology-stack-builder + +# Or 
project-level (single project) +cp -r cortex-code-skills/skills/ontology-stack-builder .cortex/skills/ontology-stack-builder +``` + +### Prerequisites + +- **Cortex Code** CLI +- **Snowflake account** with CREATE TABLE, CREATE VIEW, CREATE PROCEDURE permissions +- **Python 3.10+** (dependencies managed automatically via `uv`) + +--- + +## Usage + +A single prompt kicks off the workflow: + +``` +Use ontology-stack-builder skill. Build an ontology stack on MY_DB.MY_SCHEMA using these inputs: + +Database: MY_DB, Schema: MY_SCHEMA +Source tables: TABLE_A, TABLE_B, TABLE_C +Ontology name: MY_ONTOLOGY +Path: Direct table path +Business questions: What products does each customer buy? How are customers segmented? +Semantic views: Ontology + Metadata +``` + +### Prompt Fields + +| Field | Description | +|-------|-------------| +| **Database / Schema** | Snowflake location for all generated objects | +| **Source tables** | Existing tables to introspect for ontology design | +| **Ontology name** | Prefix for all generated objects (e.g., `STOCK`) | +| **Path** | *Knowledge Graph* — universal KG_NODE/KG_EDGE tables with graph analytics; duplicates data. *Direct Table* — views over existing tables; no data movement | +| **Business questions** | Natural-language questions that guide semantic model creation | +| **Semantic views** | Which ontology-layer models to create on top of the base: *Ontology* (abstract reasoning), *Metadata* (governance/discovery), *KG* (concrete graph queries, KG path only). A base semantic view over the source tables is always present — reused from existing or created in Phase 4.5 | + +All fields are optional — the skill will ask for anything you omit. + +--- + +## The 7-Phase Workflow + +Every phase ends with a mandatory gate — the skill stops and asks for your approval before continuing. + +### Phase 1 — Gather Inputs +Collects and validates database, schema, tables, business questions, path choice, and ontology name. 
Discovers existing semantic views in the target schema and asks whether to reuse one as the base or create from scratch. Presents a structured summary for confirmation. + +### Phase 2 — Analyze & Recommend Ontology +Introspects source tables (or parses an OWL file) and proposes classes, relations, and a class hierarchy. If an existing semantic view was found in Phase 1, its curated metadata (column descriptions, relationships, metrics) enriches the proposals. You review and adjust before proceeding. + +### Phase 3 — Visualize, Modify & Confirm +Launches an interactive Streamlit visualizer with three tabs — Hierarchy (expandable trees), Ontology Graph (interactive node-edge diagram), and Coverage (design structure). A sidebar editor lets you add, remove, or modify classes and relations visually. + +### Phase 4 — Generate & Deploy +Generates SQL for all Layer 1-3 artifacts, runs a completeness check, and deploys to Snowflake. After deployment, generates a coverage manifest and re-launches the visualizer with four coverage states: green (mapped), blue (covered by ancestor), red (unmapped), gray (abstract). + +### Phase 4.5 — Ensure Base Semantic View +If you have an existing semantic view over your source tables, the skill reuses it. Otherwise, it delegates to the native `semantic-view` skill to create a base semantic view covering your source tables directly. This ensures the final agent always has a tool for concrete data queries. + +### Phase 5 — Ontology Semantic Views +Delegates to the native `semantic-view` skill to create ontology-layer models (KG, Ontology, Metadata) over the objects deployed in Phase 4. Tests each model against the business questions from Phase 1. + +### Phase 6 — Cortex Agent +Delegates to the native `cortex-agent` skill to create the orchestration layer with intent-routed tools — base semantic (Phase 4.5) + ontology-layer semantics (Phase 5) + optional graph tools. 
+ +### Phase 7 — End-to-End Validation +Validates the full stack: row counts, sample queries, semantic view checks, and an end-to-end agent test. + +--- + +## Starting Points + +The skill adapts to what you already have: + +### By Ontology Source + +| Path | When to Use | +|------|-------------| +| **Schema-First Discovery** | You have Snowflake tables but no ontology. The skill analyzes your schema and proposes one. | +| **OWL Import** | You have a formal ontology (OWL, RDF, Turtle, N-Triples, N3). The skill parses it and maps to your tables. | +| **Hybrid** | Start with schema discovery, export, refine externally, re-import. | + +### By Existing Semantics + +| Scenario | What Happens | +|----------|-------------| +| **Have existing semantic view** | The skill discovers it in Phase 1, reuses it as the base semantic tool, and skips Phase 4.5. Existing metadata (column descriptions, relationships, metrics) enriches the ontology proposals in Phase 2. | +| **Tables only (no semantic)** | The skill creates a base semantic view in Phase 4.5 via the `semantic-view` skill before building ontology-layer semantics. | + +Both scenarios also support KG path or direct-table path — the 2x2 combination (KG/Direct x Has Semantic/No Semantic) is fully handled. + +### End-State Asset Matrix + +The table below shows what gets created for each combination of starting point. Rows are grouped by layer; columns represent the four primary paths. Optional KG-only features are shown separately. 
+ +**Core paths (always created):** + +| Layer | Artifact | KG + No Semantic | KG + Existing Semantic | Direct + No Semantic | Direct + Existing Semantic | +|-------|----------|:---:|:---:|:---:|:---:| +| **L1** | KG_NODE, KG_EDGE tables | Y | Y | — | — | +| **L1** | V_{CLASS} entity views | Y (from KG_NODE) | Y (from KG_NODE) | Y (over source tables) | Y (over source tables) | +| **L1** | V_{REL} relationship views | Y (from KG_EDGE) | Y (from KG_EDGE) | Y (over source tables) | Y (over source tables) | +| **L2** | ~22 ONT_* metadata tables + seed data | Y | Y | Y | Y | +| **L3** | VW_ONT_{CLASS} abstract views | Y | Y | Y | Y | +| **L3** | VW_ONT_ALL_ENTITIES, hierarchy views | Y | Y | Y | Y | +| **L3** | SP_GENERATE_ONTOLOGY_VIEWS | Y | Y | Y | Y | +| **L4** | Base semantic view (source tables) | Created in Phase 4.5 | Reused from existing | Created in Phase 4.5 | Reused from existing | +| **L4** | Ontology semantic view (VW_ONT_*) | If selected | If selected | If selected | If selected | +| **L4** | Metadata semantic view (ONT_*) | If selected | If selected | If selected | If selected | +| **L4** | KG semantic view (V_* views) | If selected | If selected | — | — | +| **L5** | Cortex Agent | Y | Y | Y | Y | + +**Optional features (KG path only):** + +| Layer | Artifact | Description | +|-------|----------|-------------| +| **L2** | REL_EDGE_INFERRED, ONT_CONSTRAINT_VIOLATION | Created if inference engine selected | +| **L2** | SP_INFER_TRANSITIVE, SP_INFER_INVERSE, SP_RUN_ONTOLOGY_INFERENCE | Created if inference engine selected | +| **L2** | SP_CHECK_CARDINALITY_SINGLE, SP_CHECK_REFERENTIAL | Created if inference engine selected | +| **L5** | EXPAND_DESCENDANTS_TOOL, GET_ANCESTORS_TOOL | Created if graph traversal UDFs selected | +| **L5** | GET_HIERARCHY_PATH_TOOL, GET_DIRECT_CHILDREN_TOOL | Created if graph traversal UDFs selected | +| **L5** | SPCS graph service + 3 service functions | Created if SPCS graph analytics selected | + +**Key differences by 
path:** +- **KG path** creates physical graph tables (KG_NODE/KG_EDGE) and loads data into them. Concrete views extract typed projections from PROPS. Unlocks inference engine, graph UDFs, SPCS, and KG semantic view. +- **Direct table path** creates no physical tables. Concrete views are thin wrappers over existing source tables. Lighter weight, no data duplication, but no graph analytics. +- **Existing semantic** skips Phase 4.5 entirely and enriches Phase 2 ontology proposals with curated metadata from the existing model. +- **No semantic** creates a new base semantic view in Phase 4.5 via the `semantic-view` skill. + +--- + +## Optional Features + +Not every domain needs every feature. The skill surfaces these as explicit choices during the workflow: + +| Feature | When Offered | What It Does | +|---------|-------------|-------------| +| **Inference Engine** | Phase 4 (KG path) | Stored procedures for transitive closure, inverse relationship materialization, and cardinality/referential constraint checking | +| **Graph Traversal UDFs** | Phase 4 (KG path) | 4 SQL table functions (expand descendants, get ancestors, find hierarchy path, list direct children) — pure SQL, no infrastructure | +| **SPCS Graph Analytics** | Phase 6 (KG path) | Containerized NetworkX service for centrality, community detection, and shortest path via Snowpark Container Services | + +--- + +## Project Structure + +``` +ontology-stack-builder/ +├── SKILL.md # Skill definition (the 7-phase workflow) +├── pyproject.toml # Python dependencies +├── README.md +├── assets/ +│ └── ontology-graph.png # Header image +├── scripts/ +│ ├── introspect_schema.py # Schema-first ontology discovery +│ ├── parse_owl.py # OWL/RDF parser (OWL, RDF, Turtle, N-Triples, N3) +│ ├── generate_ontology_sql.py # SQL generator for Layers 1-3 +│ ├── generate_spcs_scaffolding.py # SPCS graph service scaffolding (optional) +│ └── visualize_ontology.py # Streamlit visualizer with editor and coverage mapping +├── references/ 
+│ ├── physical_layer_template.sql # KG_NODE/KG_EDGE DDL template +│ ├── metadata_tables_template.sql # ONT_* table DDL template +│ ├── abstract_views_template.sql # VW_ONT_* view patterns +│ ├── semantic_model_template.yaml # Cortex Analyst YAML pattern +│ └── agent_config_template.json # Cortex Agent config pattern +└── specs/ + └── features/ontology-stack-builder/ + ├── requirements.md # REQ-001 through REQ-015 + ├── design.md # Architecture, data flow, script details + └── tasks.md # Implementation task tracking +``` + +Semantic views (L4) and the Cortex Agent (L5) are created by native bundled skills (`semantic-view`, `cortex-agent`), not scripts. diff --git a/skills/ontology-stack-builder/SKILL.md b/skills/ontology-stack-builder/SKILL.md new file mode 100644 index 00000000..b9f8e43b --- /dev/null +++ b/skills/ontology-stack-builder/SKILL.md @@ -0,0 +1,1204 @@ +--- +name: ontology-stack-builder +title: Build Ontology Stack on Snowflake +summary: Generates the full Ontology-on-Snowflake stack from any relational schema through a 7-phase gated workflow. +description: >- + Use for ALL requests that mention: ontology, knowledge graph, semantic layer from schema, + ontology-on-snowflake, abstract views, ontology metadata tables, KG_NODE, KG_EDGE, + build ontology stack, generate ontology from tables, ontology layer generation, + map tables to ontology, generate cortex agent config. + Triggers: build ontology, create ontology stack, schema-to-ontology, OWL import, + knowledge graph on snowflake, create semantic models from schema. + Do NOT use for: simple semantic view creation (use semantic-view skill), + standalone Cortex Agent creation (use cortex-agent skill). 
+tools: + - snowflake_sql_execute + - snowflake_object_search + - Bash + - Read + - Write + - Edit + - Grep + - Glob + - ask_user_question + - skill +prompt: "$ontology-stack-builder Build an ontology stack on MY_DB.MY_SCHEMA using tables TABLE_A, TABLE_B, TABLE_C" +language: en +status: Published +author: Tianxia Jia +type: community +demo-url: https://medium.com/snowflake/ontology-on-snowflake-from-architecture-to-deployment-with-a-cortex-code-skill-197866ce9c9f +--- + +# Ontology Stack Builder + +Generate the complete Ontology-on-Snowflake stack from any relational schema and set of business questions. + +## When to Use + +- You have Snowflake tables and want to build an ontology-powered analytics layer +- You want Cortex Agent to intelligently route questions across semantic models +- You need abstract ontology views that unify entity types across tables +- You have an OWL ontology file to map onto existing Snowflake tables +- You want graph analytics (centrality, community detection, shortest path) on your data + +## Architecture Overview + +The skill generates a 5-layer stack: + +``` +Layer 5: Cortex Agent (intent routing across semantic models + graph tools) +Layer 4: Semantic Models (KG concrete, Ontology abstract, Metadata governance) +Layer 3: Generated Abstract Views (VW_ONT_* via stored procedure) +Layer 2: Ontology Metadata (~22 ONT_* tables: classes, properties, rules, roles, etc.) +Layer 1: Physical Storage (existing tables, or KG_NODE/KG_EDGE if KG path) + + Concrete Views (V_{CLASS} entity views, V_{REL} relationship views) +``` + +## Workflow Overview + +This skill follows a 7-phase gated workflow. Each phase requires user confirmation before proceeding. + +``` +Phase 1: GATHER INPUTS + ├── Collect schema, business questions, optional OWL + ├── Ask: KG path or direct-table path? + ├── Discover existing semantic views (Step 1b) + ├── Ask: use existing semantic as base, or create from scratch? 
+ └── ⚠️ GATE: Inputs confirmed + │ +Phase 2: ANALYZE & RECOMMEND ONTOLOGY + ├── Introspect schema OR parse OWL + ├── If existing semantic: enrich proposals with curated metadata + ├── Propose classes, relations, mappings + └── ⚠️ GATE: Ontology design confirmed + │ +Phase 3: VISUALIZE, MODIFY & CONFIRM + ├── Show ontology graph (Streamlit agraph) + ├── Show coverage matrix + ├── Sidebar editor: add/delete classes & relations + ├── Click node → inline edit form (label, parent, abstract) + ├── Visual diff: gold border = new, orange = modified + ├── Save changes back to JSON / Undo all + └── ⚠️ GATE: Visual confirmation + │ +Phase 4: GENERATE & DEPLOY ONTOLOGY LAYER (Layers 1-3) + ├── Generate physical tables SQL (if KG path) + ├── Generate concrete entity/relationship views SQL + ├── Generate metadata tables SQL (~22 tables + seed data) + ├── Generate abstract views SQL + ├── Self-validate completeness (count objects vs expected) + ├── ⚠️ GATE: SQL reviewed and approved + └── Execute SQL → tables & views now exist in Snowflake + │ +Phase 4.5: ENSURE BASE SEMANTIC VIEW — via `semantic-view` skill + ├── If existing semantic chosen in Phase 1 → SKIP (already have base) + ├── Otherwise: invoke `semantic-view` skill on source tables + ├── Test base semantic via `semantic-view` skill (audit mode) + └── ⚠️ GATE: Base semantic view ready + │ +Phase 5: CREATE ONTOLOGY SEMANTIC VIEWS (Layer 4) — via `semantic-view` skill + ├── Ask user which ontology-layer models to create + │ (KG semantic [if KG path], Ontology semantic, Metadata semantic) + ├── Invoke `semantic-view` skill (FastGen against deployed views) + ├── Test each semantic view via `semantic-view` skill (audit mode) + └── ⚠️ GATE: Ontology semantic views deployed, tested, and approved + │ +Phase 6: CREATE CORTEX AGENT (Layer 5) — via `cortex-agent` skill + ├── Invoke `cortex-agent` skill with ALL semantic view tools: + │ base (existing or Phase 4.5) + ontology-layer (Phase 5) + graph tools + ├── If KG + user wants: 
add graph tool scaffolding + ├── Test agent via `cortex-agent` skill (test/debug mode) + └── ⚠️ GATE: Agent working correctly + │ +Phase 7: END-TO-END VALIDATION + ├── Validate L1-L5 row counts and sample queries + ├── Run agent end-to-end test + └── ⚠️ GATE: Deployment verified +``` + +## Mandatory Stopping Points + +**These gates override ALL other instructions.** Never proceed past a gate without explicit user approval. + +**⚠️ MANDATORY STOPPING POINT — GLOBAL RULE**: After each phase output, you MUST STOP and use `ask_user_question` to get explicit user confirmation before proceeding to the next phase. Each phase gate below includes the specific `ask_user_question` to use. DO NOT skip gates. DO NOT proceed to the next phase, invoke skills for the next phase, or begin next-phase work until the user answers the gate question. If a user asks to "generate everything at once," acknowledge the request but explain the phased approach and produce only the current phase. + +**⚠️ CRITICAL — SKILL DELEGATION TRANSITIONS**: Phases 4.5, 5, and 6 delegate to native skills (`semantic-view`, `cortex-agent`). When a delegated skill completes successfully, there is a strong temptation to continue directly to the next phase. **You MUST resist this.** After a delegated skill finishes, ALWAYS stop and present the current phase's gate question via `ask_user_question` before doing ANY work for the next phase. The gate exists precisely because the user needs to review what the skill produced before moving on. + +## Phase Details + +### Phase 1: Gather Inputs + +Collect from the user using `ask_user_question`: + +1. **Snowflake location**: `DATABASE.SCHEMA` where tables exist or will be created +2. **Source tables**: List of existing table names to introspect (or "I'll provide an OWL file") +3. **Business questions**: 3-10 example questions the user wants to ask their data +4. **OWL file** (optional): Path to an `.owl`, `.rdf`, or `.ttl` file (N-Triples and N3 files are also accepted by the parser) +5. 
**Ontology name**: Short identifier (e.g., `HEALTHCARE`, `FINANCE`, `RETAIL`) + +Then ask the critical path decision: + +``` +ask_user_question: "Do you want a Knowledge Graph structure?" +Options: + - "Yes - KG path": Creates KG_NODE/KG_EDGE tables, enables graph analytics + - "No - Direct table path": Maps ontology directly to existing tables, no graph structure +``` + +**Step 1b: Discover Existing Semantic Views** + +After collecting the schema and source tables, check whether the user already has semantic views deployed on the source data: + +```sql +SHOW SEMANTIC VIEWS IN SCHEMA {DATABASE}.{SCHEMA}; +``` + +Also use the `semantic-view` skill (discover/describe mode) to understand what tables each existing semantic view covers: +``` +cortex semantic-views discover +cortex semantic-views describe {DATABASE}.{SCHEMA}.{SEMANTIC_VIEW_NAME} +``` + +Then prompt the user: + +``` +ask_user_question: "I found N existing semantic view(s) in {DATABASE}.{SCHEMA}: [list names]. + Do any of these already cover your source tables?" +Options: + - "Yes — use my existing semantic view as the base": Reuses the existing semantic view as the + base data layer tool in the final agent. No duplicate base semantic will be created. + - "No — create everything from scratch": A new base semantic view will be created over the + source tables (Phase 4.5) before building the ontology layer. +``` + +If no semantic views are found, skip the question and default to "create from scratch." 
+ +If user selects "Yes," record: +- `existing_base_semantic`: The FQN of the existing semantic view (e.g., `DB.SCHEMA.MY_MODEL`) +- `existing_semantic_tables`: The list of tables covered by the existing semantic (from describe output) + +This information is used in: +- **Phase 2**: The existing semantic's curated column descriptions and relationships enrich ontology proposals +- **Phase 4.5**: Skipped entirely if an existing base semantic is available +- **Phase 6**: The existing semantic becomes the `base_query_tool` in the agent + +Store all inputs as session variables. If user provides only some inputs, ask for the rest. + +**Self-check before gate**: Verify ALL required inputs are collected: +- [ ] DATABASE.SCHEMA specified +- [ ] Source tables listed OR OWL file path provided +- [ ] At least 3 business questions captured +- [ ] Ontology name defined +- [ ] KG vs direct-table path chosen +- [ ] Checked for existing semantic views (`SHOW SEMANTIC VIEWS`) +- [ ] User confirmed starting point (existing semantic vs. tables only) + +If any are missing, ask the user for them. Do NOT present the gate with incomplete inputs. + +**⚠️ MANDATORY STOPPING POINT**: Present a summary of all collected inputs and the chosen path. You MUST use `ask_user_question`: + +``` +ask_user_question: "Here are the collected inputs. Is everything correct?" +Options: + - "Yes, proceed to Phase 2" + - "No, I need to change something" +``` + +### Phase 2: Analyze & Recommend Ontology + +**If OWL file provided:** + +Run the OWL parser (`{SKILL_DIR}` is this skill's root directory — the folder containing `pyproject.toml` and `scripts/`): +```bash +uv run --project {SKILL_DIR} python {SKILL_DIR}/scripts/parse_owl.py \ + --owl-file "{OWL_FILE_PATH}" \ + --output-dir "/tmp/ontology_parsed" +``` + +Then map parsed classes to existing tables using `DESCRIBE TABLE` on each source table. + +**If no OWL file (schema-first discovery):** + +Run schema introspection (`{SKILL_DIR}` is this skill's root directory; separate table names with commas and business questions with `|`): +```bash +uv run --project {SKILL_DIR} python {SKILL_DIR}/scripts/introspect_schema.py \ + --database "{DATABASE}" \ + --schema "{SCHEMA}" \ + --tables "{TABLE_1},{TABLE_2},..." \ + --questions "{QUESTION_1}|{QUESTION_2}|..." 
\ + --output-dir "/tmp/ontology_parsed" +``` + +This script: +1. Calls `DESCRIBE TABLE` and `SHOW PRIMARY KEYS` for each table via Snowflake SQL +2. Analyzes foreign key relationships and column patterns +3. Proposes ontology classes (one per table or entity type) +4. Proposes relations from FK relationships and naming conventions +5. Outputs `classes.json`, `relations.json`, `mappings.json` + +**If existing semantic view available (from Step 1b):** + +When the user has an existing semantic view, use the `semantic-view` skill (describe mode) to extract its curated metadata and enrich the ontology proposals: + +``` +cortex semantic-views describe {DATABASE}.{SCHEMA}.{EXISTING_SEMANTIC_VIEW} +``` + +The existing semantic model provides: +- **Column descriptions** → Use as class property descriptions instead of auto-generated ones +- **Relationships** → Map directly to ontology relation proposals (validates FK-based proposals) +- **Dimensions/Facts/Metrics** → Inform which columns are important for each class +- **Table references** → Confirm which source tables are in scope + +After running schema introspection (or OWL parsing), merge the existing semantic metadata: +1. For each proposed class, check if the semantic model has a matching table — if so, import its column descriptions +2. For each semantic model relationship, verify a corresponding ontology relation exists — if not, add one +3. Use the semantic model's metric definitions to annotate key properties in `classes.json` + +This enrichment is additive — the introspection still runs, but produces better-documented results when a semantic model already describes the schema. 
+ +**`classes.json` schema** — each entry MUST use these exact keys: +```json +[ + { + "name": "ClassName", + "label": "Display Name", + "description": "What this class represents", + "parent_name": null, + "is_abstract": false, + "is_deprecated": false, + "namespace": "DB.SCHEMA", + "uri": "urn:db:schema:ClassName", + "_source_table": "TABLE_NAME", + "_id_column": "ID_COL", + "_name_column": "NAME_COL" + } +] +``` + +**`relations.json` schema** — each entry MUST use these exact keys: +```json +[ + { + "name": "relation_name", + "label": "Relation Name", + "description": "What this relation represents", + "domain_class": "SourceClass", + "domain_classes": ["SourceClass"], + "range_class": "TargetClass", + "range_classes": ["TargetClass"], + "is_transitive": false, + "is_symmetric": false, + "is_functional": true, + "is_abstract": false, + "is_hierarchical": false, + "parent_name": null, + "inverse_name": null, + "cardinality": "N:1", + "uri": "urn:rel:relation_name", + "_source_table": "TABLE_NAME", + "_src_column": "SRC_COL", + "_dst_column": "DST_COL" + } +] +``` + +**IMPORTANT**: When editing `classes.json` or `relations.json` directly (e.g., adding a class via prompt), use the exact field names above. In particular: `is_abstract` (not `abstract`), `parent_name` (not `parent`), `is_deprecated` (not `deprecated`), `is_hierarchical` (not `hierarchical`). For abstract relations, set `is_abstract: true` and leave `_source_table`/`_src_column`/`_dst_column` as `null`. Concrete child relations reference their abstract parent via `parent_name`. 
+ +Present the proposed ontology to the user as a table: + +| Class Name | Source Table | Type | ID Column | Name Column | Parent Class | +|------------|-------------|------|-----------|-------------|--------------| + +And proposed relations: + +| Relation | Domain Class | Range Class | Source | Cardinality | +|----------|-------------|-------------|--------|-------------| + +**Self-check before gate**: Verify the ontology design is complete: +- [ ] Every source table maps to at least one class +- [ ] Every class has a `_source_table`, `_id_column`, and `_name_column` (unless abstract) +- [ ] Every foreign key relationship has a corresponding relation entry +- [ ] No duplicate class names or relation names +- [ ] `classes.json` and `relations.json` use exact field names from the schema above +- [ ] At least one concrete class exists + +If any check fails, fix the issue before presenting to the user. Do NOT ask the user to approve an incomplete ontology. + +**⚠️ MANDATORY STOPPING POINT**: You MUST use `ask_user_question`: + +``` +ask_user_question: "Please review the proposed ontology above. Want to proceed, or make changes?" 
+Options: + - "Looks good, proceed to Phase 3" + - "I want to add/remove/change classes or relations" +``` + +### Phase 3: Visualize, Modify & Confirm + +Launch the visualization & editing app: +```bash +uv run --project python /scripts/visualize_ontology.py -- \ + --classes-json "/tmp/ontology_parsed/classes.json" \ + --relations-json "/tmp/ontology_parsed/relations.json" \ + --port 8501 +``` + +This shows: +- **Hierarchy** tab with two expandable sections: **Class Hierarchy** (interactive/text toggle, search, depth slider) and **Relation Hierarchy** (interactive/text toggle, abstract/concrete grouping) +- Ontology graph with coverage coloring and **Layer toggle** (All / Concrete / Abstract) +- Coverage tab showing the ontology design structure (implementation mapping becomes available after Phase 4 deployment) + +The **Layer toggle** on the Ontology Graph tab controls visibility: +- **All**: shows both abstract and concrete classes/relations (abstract items rendered distinctly) +- **Concrete**: hides abstract classes and abstract relations +- **Abstract**: shows only abstract classes and abstract relations (the schema blueprint) + +The sidebar **Ontology Editor** allows the user to modify the ontology before generation: +- **Add Class**: name, label, description, parent (dropdown), abstract toggle +- **Add Relation**: name, source/target class dropdowns, cardinality, description +- **Delete Class**: re-parents children to root, removes involved relations +- **Delete Relation**: removes the selected relation +- **Save Changes**: writes modified `classes.json` and `relations.json` back to disk +- **Undo All**: resets to the original loaded state + +When a node is clicked in the Ontology Graph tab, an **Edit Class** form appears below the +detail panel with label, description, parent, and abstract fields. 
+ +**Visual diff markers** show what changed this session: +- Gold border (thick) = newly added class or relation +- Orange border (thick) = modified class +- Tooltips show NEW / MODIFIED tags + +**Edge styling** distinguishes relation types: +- Solid blue = subClassOf (class hierarchy) +- Dashed purple = concrete relation +- Solid deep purple (thick) = abstract relation +- Dotted light purple = specializes (child relation → parent relation) + +If Streamlit is not available or user prefers text, render an ASCII tree instead: +``` +Entity +├── Person +│ ├── Employee +│ └── Customer +├── Organization +│ ├── Department +│ └── Supplier +└── Event + ├── Order + └── Meeting +``` + +**Self-check before gate**: If user made edits in the visualizer, verify: +- [ ] Modified `classes.json` and `relations.json` were saved (check file timestamps or diff) +- [ ] Any added classes still follow the schema (all required fields present) +- [ ] Any added relations reference valid domain/range classes that exist in `classes.json` +- [ ] No orphaned relations pointing to deleted classes + +If the visualizer was used, re-read the JSON files to pick up user edits. + +**⚠️ MANDATORY STOPPING POINT**: You MUST use `ask_user_question`: + +``` +ask_user_question: "The visualizer is running. Please review the ontology structure. Ready to proceed to SQL generation (Phase 4)?" 
+Options: + - "Yes, proceed to Phase 4" + - "No, I need to make changes first" +``` + +### Phase 4: Generate & Deploy Ontology Layer (Layers 1-3) + +**If KG path:** +Generate `01_physical_layer.sql`: +- `CREATE TABLE KG_NODE` with NODE_ID, NODE_TYPE, NAME, PROPS (VARIANT), TS_INGESTED +- `CREATE TABLE KG_EDGE` with EDGE_ID, SRC_ID, DST_ID, EDGE_TYPE, WEIGHT, PROPS, EFFECTIVE_START/END +- Both clustered by type columns +- INSERT statements to load data from source tables into KG_NODE/KG_EDGE + +Generate `02_concrete_views.sql`: +- Per-class entity views: `V_{CLASS}` — typed projections from KG_NODE using `PROPS:field::TYPE` +- Per-relation relationship views: `V_{REL}` — typed projections from KG_EDGE with SRC_ID, DST_ID, WEIGHT, EFFECTIVE_START/END +- Direct-table path: thin SELECT wrappers over source tables + +**For both paths:** +Generate `03_metadata_tables.sql` (~22 tables): +- Core: ONT_ONTOLOGY, ONT_CLASS, ONT_RELATION_DEF, ONT_CLASS_MAP, ONT_REL_MAP +- Source mappings: ONT_OBJECT_SOURCE, ONT_LINK_SOURCE +- Properties: ONT_SHARED_PROPERTY, ONT_PROPERTY, ONT_DERIVED_PROPERTY +- Interfaces: ONT_INTERFACE, ONT_INTERFACE_PROPERTY, ONT_INTERFACE_IMPL +- Inference: ONT_RULE, REL_EDGE_INFERRED +- Data quality: ONT_CONSTRAINT_VIOLATION +- Actions: ACT_TYPE, ACT_DEF +- Functions: ONT_FUNCTION, ONT_FUNCTION_BINDING +- Views: OBJ_VIEW_DEF, OBJ_VIEW_FIELD +- RBAC: ONT_ROLE, ONT_ROLE_BINDING, ONT_PERMISSION +- Auto-populated INSERT statements derived from class/relation metadata + +Generate `04_abstract_views.sql`: +- Per-class views: `VW_ONT_{CLASS_NAME}` — UNION ALL of all source tables for that class +- Hierarchy views: `VW_ONT_SUBCLASS_OF`, `VW_DESCENDANTS`, `VW_ANCESTORS` +- Unified entity view: `VW_ONT_ALL_ENTITIES` +- Stats view: `VW_ONT_HIERARCHY_STATS` +- Resolved relationships view: `REL_RESOLVED` (joins edges with node names) + +Generate `05_view_generator_sp.sql`: +- `SP_GENERATE_ONTOLOGY_VIEWS()` stored procedure that reads ONT_CLASS_MAP and regenerates 
views dynamically + +**Optional — KG path only:** +Ask the user: +``` +ask_user_question: "Generate inference engine stored procedures?" +Options: + - "Yes": Generates SP_INFER_TRANSITIVE, SP_INFER_INVERSE, SP_RUN_ONTOLOGY_INFERENCE, + SP_CHECK_CARDINALITY_SINGLE, SP_CHECK_REFERENTIAL — enables rule-based inference and data quality checks + - "No": Skip inference engine (can be added later) +``` + +If yes, generate `06_inference_engine.sql`: +- `SP_INFER_TRANSITIVE(TARGET_REL, RULE_ID)` — recursive transitive closure (depth ≤ 5) +- `SP_INFER_INVERSE(RULE_ID)` — materialise inverse relationships from ONT_RELATION_DEF +- `SP_RUN_ONTOLOGY_INFERENCE()` — master runner for all enabled ONT_RULE entries +- `SP_CHECK_CARDINALITY_SINGLE(REL, CHECK_NAME)` — validate 1:1/N:1 constraints +- `SP_CHECK_REFERENTIAL(REL, CHECK_NAME)` — validate edge endpoint integrity +- Results written to `REL_EDGE_INFERRED` and `ONT_CONSTRAINT_VIOLATION` + +**KG path only**: Ask the user about SQL graph traversal tools: +``` +ask_user_question: "Generate SQL graph traversal tools for the agent?" +Options: + - "Yes": Generates 4 SQL UDF tools (EXPAND_DESCENDANTS_TOOL, GET_ANCESTORS_TOOL, + GET_HIERARCHY_PATH_TOOL, GET_DIRECT_CHILDREN_TOOL) — zero infrastructure, pure SQL + against KG_NODE/KG_EDGE. These are registered as generic tools in the Cortex Agent. + - "No": Skip graph traversal tools (agent uses only semantic view tools) +``` + +If yes, generate `07_graph_traversal_tools.sql`: +- `EXPAND_DESCENDANTS_TOOL(ROOT_CONCEPT)` — recursive downward traversal, returns all descendants with depth and path +- `GET_ANCESTORS_TOOL(CONCEPT)` — recursive upward traversal, returns all ancestors with shortest depth +- `GET_HIERARCHY_PATH_TOOL(START_CONCEPT, END_CONCEPT)` — finds path between two concepts via subClassOf edges +- `GET_DIRECT_CHILDREN_TOOL(PARENT_CONCEPT)` — single-hop children lookup + +These are complementary to SPCS graph tools (Phase 6b). 
SQL UDF tools handle hierarchy traversal with zero infrastructure. SPCS tools handle advanced graph algorithms (centrality, community detection) but require a container service. + +Use the generator script: +```bash +uv run --project python /scripts/generate_ontology_sql.py \ + --classes-json "/tmp/ontology_parsed/classes.json" \ + --relations-json "/tmp/ontology_parsed/relations.json" \ + --mappings-json "/tmp/ontology_parsed/mappings.json" \ + --database "" \ + --schema "" \ + --ontology-name "" \ + --kg-path \ + --include-inference \ + --include-graph-tools \ + --output-dir "/tmp/generated" +``` + +Present all generated SQL files to the user for review. + +**Step 4a-verify: Completeness check (MANDATORY before asking for approval)** + +Before presenting the gate, self-validate that ALL expected objects are present in the generated SQL: + +| File | Expected objects | +|------|-----------------| +| `01_physical_layer.sql` (KG only) | `KG_NODE`, `KG_EDGE` tables + INSERT statements | +| `02_concrete_views.sql` | One `V_{CLASS}` view per concrete class, one `V_{REL}` view per relation | +| `03_metadata_tables.sql` | All ~22 `ONT_*` tables + INSERT seed data | +| `04_abstract_views.sql` | One `VW_ONT_{CLASS}` per class, plus `VW_ONT_SUBCLASS_OF`, `VW_DESCENDANTS`, `VW_ANCESTORS`, `VW_ONT_ALL_ENTITIES`, `VW_ONT_HIERARCHY_STATS`, `REL_RESOLVED` | +| `05_view_generator_sp.sql` | `SP_GENERATE_ONTOLOGY_VIEWS` stored procedure | +| `06_inference_engine.sql` (if selected) | `SP_INFER_TRANSITIVE`, `SP_INFER_INVERSE`, `SP_RUN_ONTOLOGY_INFERENCE`, `SP_CHECK_CARDINALITY_SINGLE`, `SP_CHECK_REFERENTIAL` | +| `07_graph_traversal_tools.sql` (if selected) | `EXPAND_DESCENDANTS_TOOL`, `GET_ANCESTORS_TOOL`, `GET_HIERARCHY_PATH_TOOL`, `GET_DIRECT_CHILDREN_TOOL` | + +Cross-check: count the CREATE VIEW/TABLE/PROCEDURE statements in each file against the expected counts derived from `classes.json` and `relations.json`. 
If anything is missing, fix the generation before presenting to the user. Do NOT ask the user to approve incomplete SQL. + +**⚠️ MANDATORY STOPPING POINT — DO NOT EXECUTE SQL**: Present the completeness summary and all generated SQL files. You MUST use `ask_user_question` to get explicit approval before executing any SQL: + +``` +ask_user_question: "Phase 4 SQL generation complete. I've generated the SQL files listed above. Ready to deploy to Snowflake?" +Options: + - "Yes, deploy all SQL" + - "No, I need to review/change something first" +``` + +DO NOT execute any SQL statements, DO NOT proceed to Step 4b, until the user explicitly approves via the question above. + +**Step 4b: Deploy SQL artifacts (Layers 1-3)** + +After approval, execute all SQL scripts in order so that tables and views exist for Phases 5-6: + +1. **Physical layer** (if KG path): Execute `01_physical_layer.sql` +2. **Concrete views**: Execute `02_concrete_views.sql` (V_{CLASS} entity views, V_{REL} relationship views) +3. **Metadata tables**: Execute `03_metadata_tables.sql` (~22 tables + auto-populated seed data) +4. **Abstract views**: Execute `04_abstract_views.sql` +5. **View generator SP**: Execute `05_view_generator_sp.sql` +6. **Inference engine** (if selected): Execute `06_inference_engine.sql` +7. **SPCS graph service** (only if `spcs_setup.sql` already exists, e.g. from a resumed session — it is normally generated and deployed later, in Phase 6 Step 6b; otherwise skip this item): Execute `spcs_setup.sql` +8. 
**Graph traversal tools** (if selected): Execute `07_graph_traversal_tools.sql` + +Quick-validate that key objects exist: +```sql +SHOW VIEWS LIKE 'VW_ONT_%' IN SCHEMA {DATABASE}.{SCHEMA}; +SHOW TABLES LIKE 'ONT_%' IN SCHEMA {DATABASE}.{SCHEMA}; +-- If graph traversal tools selected: +SHOW USER FUNCTIONS LIKE '%_TOOL' IN SCHEMA {DATABASE}.{SCHEMA}; +``` + +Present a deployment summary to the user: +- Number of tables created (KG tables + ONT_* metadata tables) +- Number of views created (V_* concrete views + VW_ONT_* abstract views) +- Number of stored procedures created +- Number of UDFs created (if graph traversal tools selected) +- Any warnings or errors encountered during execution + +**⚠️ MANDATORY STOPPING POINT — DO NOT PROCEED TO PHASE 5** + +Before you can present the gate question, you MUST complete ALL of the following steps in order. Do NOT skip any step. Do NOT present the gate question until all 3 steps are done. + +**Required Step A: Generate deployed-objects manifest** + +Build a `deployed_objects.json` manifest from the SHOW commands output. The manifest maps each ontology concept to its generated Snowflake artifact(s), enabling the visualizer to show the "Original Design → Snowflake Implementation" mapping. 
+ +Write the manifest to `/tmp/ontology_parsed/deployed_objects.json`: +```json +{ + "source": "", + "database": "{DATABASE}", + "schema": "{SCHEMA}", + "views": ["V_DOCTOR", "V_PATIENT", "VW_ONT_PERSON", "VW_ONT_ALL_ENTITIES", ...], + "tables": ["KG_NODE", "KG_EDGE", "ONT_CLASS_DEF", "ONT_RELATION_DEF", ...], + "procedures": ["SP_GENERATE_ONTOLOGY_VIEWS", ...], + "udfs": ["EXPAND_DESCENDANTS_TOOL", ...], + "class_to_objects": { + "Doctor": {"views": ["V_DOCTOR", "VW_ONT_DOCTOR"], "metadata_row": "ONT_CLASS_DEF"}, + "Patient": {"views": ["V_PATIENT", "VW_ONT_PATIENT"], "metadata_row": "ONT_CLASS_DEF"}, + "Person": {"views": ["VW_ONT_PERSON"], "metadata_row": "ONT_CLASS_DEF"} + }, + "relation_to_objects": { + "treats": {"view": "V_TREATS", "metadata_row": "ONT_RELATION_DEF"}, + "subClassOf": {"view": "VW_ONT_SUBCLASS_OF", "metadata_row": "ONT_RELATION_DEF"} + } +} +``` + +Build the `class_to_objects` mapping by iterating `classes.json`: +- For each concrete class: look for `V_{CLASS_NAME}` in the views list → add to views array +- For each class (abstract or concrete): look for `VW_ONT_{CLASS_NAME}` in the views list → add to views array +- Every class gets `"metadata_row": "ONT_CLASS_DEF"` since all classes are seeded into that table + +Build the `relation_to_objects` mapping by iterating `relations.json`: +- For each relation: look for `V_{RELATION_NAME}` in the views list → set as view +- Every relation gets `"metadata_row": "ONT_RELATION_DEF"` + +**Required Step B: Re-launch the visualizer with coverage mapping** + +Stop the Phase 3 visualizer if still running, then re-launch with the `--deployed-objects` flag: +```bash +uv run --project python /scripts/visualize_ontology.py -- \ + --classes-json "/tmp/ontology_parsed/classes.json" \ + --relations-json "/tmp/ontology_parsed/relations.json" \ + --deployed-objects "/tmp/ontology_parsed/deployed_objects.json" \ + --port 8501 +``` + +Without re-launching, the graph will show all nodes as unmapped (red). 
After re-launch with `--deployed-objects`, nodes will be colored by implementation status: +- **Coverage tab**: "Original Design → Snowflake Implementation" with progress bar, 3-column breakdown (Mapped / Covered by Ancestor / Unmapped), and full artifact inventory +- **Graph tab**: Nodes colored green (mapped), blue (covered by ancestor), red (unmapped), gray (abstract) +- **Node detail**: Click any node to see its deployed Snowflake objects + +**Required Step C: Verify the visualizer shows coverage (NOT all red/unmapped)** + +Open the visualizer and confirm that concrete classes show as green (mapped) or blue (covered), NOT red (unmapped). If everything is still red, the manifest was not loaded — check the `--deployed-objects` path and re-launch. + +**Only after Steps A, B, C are done**, present the gate question using `ask_user_question`: + +``` +ask_user_question: "Phase 4 deployment complete. All Layer 1-3 objects are deployed. The visualizer is showing the coverage mapping. Ready to proceed to Phase 4.5 (Base Semantic View)?" +Options: + - "Yes, proceed to Phase 4.5" + - "No, I need to fix something first" +``` + +DO NOT invoke the `semantic-view` skill, DO NOT start Phase 4.5 or Phase 5, and DO NOT continue until the user explicitly confirms via the question above. + +### Phase 4.5: Ensure Base Semantic View — via `semantic-view` skill + +The final delivery always includes a **base semantic view** that covers the source tables directly (for concrete data queries). This phase ensures one exists — either by reusing an existing semantic view or creating a new one. + +**If existing semantic view selected in Phase 1 (Step 1b):** + +Skip this phase entirely — including the Phase 4.5 gate below. The user's existing semantic view becomes the base. Record its FQN: +``` +base_semantic_view = "{DATABASE}.{SCHEMA}.{EXISTING_SEMANTIC_VIEW_NAME}" +``` + +Proceed directly to Phase 5 (do NOT present the Phase 4.5 gate question). 
+ +**If no existing semantic view (create from scratch):** + +Invoke the native **`semantic-view` skill** (creation mode) to build a base semantic view over the **original source tables** (not the VW_ONT_* views — those are covered in Phase 5): + +``` +skill: semantic-view + +Context to provide when the skill asks: + - Semantic view name: {ONTOLOGY_NAME}_BASE + - Target database: {DATABASE} + - Target schema: {SCHEMA} + - Table references: The original source tables from Phase 1 + (e.g., COMPANIES_ENRICHED, SECURITIES, EXCHANGES, SECTORS, etc.) + - SQL queries: Convert the business questions from Phase 1 into SELECT statements + against the source tables (these become VQRs in the semantic model) + - Business context: "Base data layer semantic view for direct queries against source tables. + Use for specific entity lookups, aggregations, filtering by attributes, and concrete + data questions that don't require ontology reasoning." +``` + +The `semantic-view` skill will: +1. Call FastGen to auto-generate the semantic model YAML from the source table metadata +2. Validate the YAML via `SYSTEM$CREATE_SEMANTIC_VIEW_FROM_YAML(..., TRUE)` +3. Present the generated model for review +4. Deploy the semantic view + +**Step 4.5a: Test the base semantic view** + +Invoke the `semantic-view` skill in **audit** mode to validate: +``` +skill: semantic-view + +Context: Audit the semantic view {DATABASE}.{SCHEMA}.{ONTOLOGY_NAME}_BASE + - Run 2-3 of the business questions from Phase 1 against the base semantic view + - Verify the generated SQL executes without errors + - Verify the results contain expected data (non-empty, reasonable values) + - If any query fails, use the skill's refine workflow to fix the semantic model +``` + +Record the base semantic view FQN: +``` +base_semantic_view = "{DATABASE}.{SCHEMA}.{ONTOLOGY_NAME}_BASE" +``` + +**Step 4.5b: Record describe metadata for Phase 6** + +Run `cortex semantic-views describe` on the base semantic view and record the output. 
This metadata feeds into Phase 6's tool descriptions for the agent: + +``` +cortex semantic-views describe {DATABASE}.{SCHEMA}.{ONTOLOGY_NAME}_BASE +``` + +Record: description, table count, dimension/fact/metric counts, and key dimension names. This enables Phase 6 to generate rich, data-aware tool descriptions without relying on session memory. + +**Self-check before gate**: Verify the base semantic view is ready: +- [ ] Base semantic view exists: `SHOW SEMANTIC VIEWS LIKE '{ONTOLOGY_NAME}_BASE' IN SCHEMA {DATABASE}.{SCHEMA}` + (or existing semantic view confirmed in Phase 1) +- [ ] At least 2 business questions returned non-empty results via audit +- [ ] `base_semantic_view` FQN is recorded for use in Phase 6 + +**⚠️ MANDATORY STOPPING POINT — DO NOT PROCEED TO PHASE 5**: When the `semantic-view` skill finishes, you MUST STOP HERE. (This gate applies only to the create-from-scratch path. If an existing semantic view was selected in Step 1b, Phase 4.5 and this gate are skipped entirely, as described at the start of this phase.) Do NOT begin Phase 5, do NOT describe VW_ONT_* views, do NOT start building ontology-layer semantic views. You MUST use `ask_user_question`: + +``` +ask_user_question: "Base semantic view is ready ({base_semantic_view}). Ready to proceed to Phase 5 (Ontology Semantic Views)?" +Options: + - "Yes, proceed to Phase 5" + - "No, I need to fix the base semantic view" +``` + +**STOP. Wait for the user's answer. Do not read ahead. Do not begin Phase 5 until the user selects an option above.** + +### Phase 5: Create Ontology Semantic Views (Layer 4) — via `semantic-view` skill + +> **GATE CHECK**: You should only be here if the user explicitly approved the Phase 4.5 gate above, or if Phase 4.5 was skipped because an existing semantic view was selected in Step 1b (in which case the user must have approved the Phase 4 deployment gate). If you arrived here without that approval, STOP and go back to present the gate question you skipped. + +> **Note**: The **base semantic view** (covering source tables directly) was already created or identified in Phase 4.5. This phase creates **ontology-layer** semantic views over the objects deployed in Phase 4 (VW_ONT_*, V_*, ONT_* tables). 
+ +Ask the user which **ontology-layer** semantic views to create: + +``` +ask_user_question: "Which ontology-layer semantic views should I create? (The base semantic view over source tables is already ready from Phase 4.5.)" +multiSelect: true +Options: + - "KG Semantic View": Concrete entity views (V_{CLASS}) and relationship views (V_{REL}) for fast direct queries + [Only available if KG path chosen] + - "Ontology Semantic View": Abstract views (VW_ONT_*) for cross-type reasoning + - "Metadata & Governance Semantic View": All ~22 ONT_* introspection tables for governance and data quality +``` + +**For each selected semantic view**, invoke the native **`semantic-view` skill** (creation mode) to generate, validate, and deploy the semantic view. The skill uses Snowflake's FastGen system function, which auto-discovers columns, infers primary keys, generates dimensions/measures/metrics/relationships, and creates verified queries. + +**Step 5a: Determine base tables for each model** + +Gather the base tables from the ontology objects deployed in Phase 4: + +- **KG Semantic View**: List all `V_{CLASS}` entity views and `V_{REL}` relationship views: + ```sql + SHOW VIEWS LIKE 'V_%' IN SCHEMA {DATABASE}.{SCHEMA}; + -- NOTE: `_` is a single-character wildcard in LIKE, so this pattern also matches the + -- VW_ONT_* abstract views. Drop every VW_ONT_* row from the result before using the list. + ``` +- **Ontology Semantic View**: List all `VW_ONT_%` abstract views: + ```sql + SHOW VIEWS LIKE 'VW_ONT_%' IN SCHEMA {DATABASE}.{SCHEMA}; + ``` +- **Metadata Semantic View**: List all `ONT_%` metadata tables: + ```sql + SHOW TABLES LIKE 'ONT_%' IN SCHEMA {DATABASE}.{SCHEMA}; + ``` + +**Step 5b: Invoke the `semantic-view` skill for each selected model** + +For each selected model, invoke the skill with this context: + +``` +skill: semantic-view + +Context to provide when the skill asks: + - Semantic view name: {ONTOLOGY_NAME}_KG_MODEL (or _ONTOLOGY_MODEL or _METADATA_MODEL) + - Target database: {DATABASE} + - Target schema: {SCHEMA} + - Table references: The base tables gathered in Step 5a + - SQL queries: Convert the business questions from Phase 1 into 
SELECT statements + against the base tables (these become VQRs in the semantic model) + - Business context: Describe the model's purpose and intent-routing role +``` + +The `semantic-view` skill will: +1. Call FastGen to auto-generate the semantic model YAML from the actual Snowflake metadata +2. Validate the YAML via `SYSTEM$CREATE_SEMANTIC_VIEW_FROM_YAML(..., TRUE)` +3. Present the generated model for review +4. Offer options: audit, deploy, or refine + +**Step 5c: Deploy each semantic view** + +Use the `semantic-view` skill's upload workflow to deploy each approved semantic view to Snowflake: +```sql +-- The skill handles this via upload_semantic_view_yaml.py, which runs: +-- CALL SYSTEM$CREATE_SEMANTIC_VIEW_FROM_YAML('{DATABASE}.{SCHEMA}', $${yaml}$$, FALSE); +``` + +Verify deployment: +```sql +SHOW SEMANTIC VIEWS IN SCHEMA {DATABASE}.{SCHEMA}; +``` + +**Step 5d: Test each semantic view** + +For each deployed semantic view, invoke the `semantic-view` skill in **audit/test** mode to validate it returns correct results: + +``` +skill: semantic-view + +Context: Audit the semantic view {DATABASE}.{SCHEMA}.{SEMANTIC_VIEW_NAME} + - Run 2-3 of the business questions from Phase 1 against each semantic view + - Verify the generated SQL executes without errors + - Verify the results contain expected data (non-empty, reasonable values) + - If any query fails, use the skill's refine workflow to fix the semantic model +``` + +The `semantic-view` skill will test queries via Cortex Analyst, show generated SQL and results, and flag any issues. + +**Step 5e: Record describe metadata for Phase 6** + +For EACH deployed ontology-layer semantic view, run `cortex semantic-views describe` and record the output. 
This metadata feeds into Phase 6's tool descriptions for the agent: + +``` +FOR EACH semantic view deployed in Step 5c: + cortex semantic-views describe {DATABASE}.{SCHEMA}.{SEMANTIC_VIEW_NAME} +``` + +Record for each: description, table count, dimension/fact/metric counts, and key dimension names. This enables Phase 6 to generate rich, data-aware tool descriptions without relying on session memory of what was created in this phase. + +**Self-check before gate**: Verify all semantic views are complete: +- [ ] `SHOW SEMANTIC VIEWS IN SCHEMA {DATABASE}.{SCHEMA}` includes a row for every selected model (the output also lists the base semantic view and any other pre-existing semantic views) +- [ ] Each semantic view name matches expected naming ({ONTOLOGY_NAME}_KG_MODEL, _ONTOLOGY_MODEL, _METADATA_MODEL) +- [ ] Step 5d tests passed — at least 2 business questions per view returned non-empty results +- [ ] No unresolved validation errors from the `semantic-view` skill + +If any semantic view is missing or any test failed, fix it before presenting the gate. + +**⚠️ MANDATORY STOPPING POINT — DO NOT PROCEED TO PHASE 6**: When the last `semantic-view` skill invocation finishes, you MUST STOP HERE. Do NOT invoke the `cortex-agent` skill, do NOT begin agent creation, do NOT start discovering deployed assets. You MUST use `ask_user_question`: + +``` +ask_user_question: "All semantic views are deployed. Please review and test them. Ready to proceed to Phase 6 (Agent creation)?" +Options: + - "Yes, proceed to Phase 6" + - "No, I need to fix a semantic view" +``` + +**STOP. Wait for the user's answer. Do not read ahead. Do not begin Phase 6 until the user selects an option above.** + +### Phase 6: Create Cortex Agent (Layer 5) — via `cortex-agent` skill + +> **GATE CHECK**: You should only be here if the user explicitly approved the Phase 5 gate above. If you arrived here without asking the user, STOP and go back to present the Phase 5 gate question. 
+ +**Step 6-pre: Discover deployed assets** + +Before building anything, dynamically discover all assets from prior phases using Snowflake discovery commands. This makes Phase 6 resilient to session resume — it works even if earlier phases ran in a different session. + +**1. Discover semantic views:** + +``` +cortex semantic-views list --like '%' --in database {DATABASE} schema {SCHEMA} +``` + +For EACH semantic view found, run: +``` +cortex semantic-views describe {DATABASE}.{SCHEMA}.{SEMANTIC_VIEW_NAME} +``` + +Record for each: name, FQN, description, table references, dimension/fact/metric counts, and key dimension names. + +**2. Classify each semantic view by role** using the `describe` output: + +- **Base**: covers source tables from Phase 1 (table names are original source tables, not VW_ONT_* or V_* views) +- **KG**: covers `V_*` typed views and/or `KG_NODE`/`KG_EDGE` +- **Ontology**: covers `VW_ONT_*` abstract views +- **Metadata**: covers `ONT_*` introspection tables + +Use the table references from the `describe` output to classify. If ambiguous, check the semantic view name for hints (`_BASE`, `_KG_MODEL`, `_ONTOLOGY_MODEL`, `_METADATA_MODEL`). + +**3. Discover graph UDFs (if KG path):** + +```sql +SHOW USER FUNCTIONS LIKE '%_TOOL' IN SCHEMA {DATABASE}.{SCHEMA}; +``` + +For EACH function found, run `DESCRIBE FUNCTION` with the argument types taken from the `arguments` column of the SHOW output. The signature must match exactly — e.g. `(VARCHAR)` for the single-argument tools and `(VARCHAR, VARCHAR)` for the two-argument `GET_HIERARCHY_PATH_TOOL`: +```sql +DESCRIBE FUNCTION {DATABASE}.{SCHEMA}.{FUNCTION_NAME}({ARGUMENT_TYPES}); +``` + +Record: name, FQN, parameters, return type. + +**4. 
Discover SPCS service functions (if any):** + +```sql +SHOW FUNCTIONS LIKE '%GRAPH_%' IN SCHEMA {DATABASE}.{SCHEMA}; +``` + +**Step 6a: Build the tool inventory from discovered assets** + +Build `TOOL_INVENTORY` from the discovery results — NOT from memory of earlier phases: + +``` +TOOL_INVENTORY = [] +``` + +For each classified semantic view: +- Base semantic view → add `base_query_tool` (type: `cortex_analyst_text_to_sql`, FQN from discovery) +- KG semantic view → add `kg_query_tool` (type: `cortex_analyst_text_to_sql`, FQN from discovery) +- Ontology semantic view → add `ontology_query_tool` (type: `cortex_analyst_text_to_sql`, FQN from discovery) +- Metadata semantic view → add `metadata_query_tool` (type: `cortex_analyst_text_to_sql`, FQN from discovery) + +For each discovered graph UDF: +- `EXPAND_DESCENDANTS_TOOL` → add `expand_descendants_tool` (type: `generic`, FQN from discovery) +- `GET_ANCESTORS_TOOL` → add `get_ancestors_tool` +- `GET_HIERARCHY_PATH_TOOL` → add `get_hierarchy_path_tool` +- `GET_DIRECT_CHILDREN_TOOL` → add `get_direct_children_tool` + +For each SPCS service function → add to TOOL_INVENTORY with type `generic`. + +Only add tools for assets that were actually found by discovery. Do NOT add tools based on session memory or assumptions about what should exist. + +**Cross-check:** Verify every discovered semantic view maps to exactly one tool. Flag any unclassified semantic views or missing expected assets. + +**Record the final inventory with discovery metadata:** +``` +TOOL_INVENTORY = [ + { name: "base_query_tool", type: "cortex_analyst_text_to_sql", + source_fqn: "{FQN}", role: "base", + describe_metadata: { tables: N, dimensions: N, facts: N, metrics: N, + key_dimensions: [...], description: "..." } }, + ... +] +Total tools: {N} +``` + +Present this to the user before proceeding: *"I discovered {N} deployable assets. The agent will have {N} tools: {list}. 
Correct?"* + +**Step 6b: Check for graph analytics tools (KG path only)** + +If KG path, check what graph tools were deployed in Phase 4: + +- **SQL UDF graph traversal tools**: If `07_graph_traversal_tools.sql` was generated and deployed, the 4 UDFs (`EXPAND_DESCENDANTS_TOOL`, `GET_ANCESTORS_TOOL`, `GET_HIERARCHY_PATH_TOOL`, `GET_DIRECT_CHILDREN_TOOL`) are already available and should be registered as `generic` tools in the agent. +- **SPCS graph analytics tools**: If the user also wants advanced graph algorithms (centrality, community detection), offer SPCS: + +``` +ask_user_question: "Include SPCS graph analytics tools in the agent? (SQL hierarchy tools are already deployed)" +Options: + - "Yes": Adds centrality, community detection, shortest path tools (requires SPCS container service) + - "No": Agent uses semantic view tools + SQL graph traversal tools only +``` + +If SPCS is selected, generate SPCS scaffolding only (no agent spec): +```bash +uv run --project {SKILL_DIR} python {SKILL_DIR}/scripts/generate_spcs_scaffolding.py \ + --database "{DATABASE}" \ + --schema "{SCHEMA}" \ + --output-dir "/tmp/generated" +``` +This produces: +- `spcs_graph_service.py` — NetworkX-based graph analytics service +- `spcs_setup.sql` — SPCS service creation, compute pool, service functions + +Guide the user through deploying the SPCS service before proceeding: +```sql +-- Execute spcs_setup.sql to create the compute pool, image repository, +-- service, and service functions in {DATABASE}.{SCHEMA} +``` + +**Step 6c: Invoke the `cortex-agent` skill to create the agent** + +Invoke the native **`cortex-agent` skill** in **create** mode. Provide the discovered tool inventory and domain context — let the skill generate the orchestration instructions, tool descriptions, and agent spec. Do NOT hand-write the orchestration prompt. 
+ +``` +skill: cortex-agent + +Context to provide when the skill asks: + - Agent name: {ONTOLOGY_NAME}_AGENT + - Target database: {DATABASE} + - Target schema: {SCHEMA} + - Domain: {ONTOLOGY_NAME} — {1-sentence domain description from Phase 1} + + - Tool inventory with discovery metadata (from Step 6-pre/6a): + + FOR EACH tool in TOOL_INVENTORY, provide: + + Tool #{N}: {tool.name} + Type: {tool.type} (cortex_analyst_text_to_sql | generic) + Source: {tool.source_fqn} + Role: {tool.role} (base | kg | ontology | metadata | graph_traversal) + Discovery metadata: + - For semantic view tools: "{tool.describe_metadata.description}. + Covers {tool.describe_metadata.tables} tables with + {tool.describe_metadata.dimensions} dimensions, + {tool.describe_metadata.facts} facts, + {tool.describe_metadata.metrics} metrics. + Key dimensions: {tool.describe_metadata.key_dimensions}." + - For graph UDF tools: "Parameters: {tool.parameters}. + Returns: {tool.return_type}." + Sample questions this tool should answer (from Phase 1 business questions): + - "{question mapped to this tool's role}" + - "{another question mapped to this tool's role}" + + - Routing principles (high-level guidance for the skill to use when generating orchestration): + * base tools: concrete source data — entity lookups, aggregations, filtering + * ontology tools: cross-type reasoning via abstract views + * kg tools: KG entity/relationship queries via typed views + * metadata tools: governance, structure introspection, data quality + * graph tools: hierarchy traversal, path finding, ancestor/descendant exploration + (include only lines for tool roles present in TOOL_INVENTORY) + + - The `cortex-agent` skill should generate the full orchestration instructions, + tool descriptions (with "When to Use" / "When NOT to Use" routing hints), + and the complete agent spec including tool_resources. 
+ + - For graph traversal UDF tools, note these are SQL UDFs (not stored procedures), + so tool_resources should use "type": "function". +``` + +The `cortex-agent` skill will: +1. Verify admin setup (warehouse, role, permissions) +2. Use the tool inventory and discovery metadata to generate rich tool descriptions +3. Generate orchestration instructions with proper routing between all tools +4. Build the agent spec with all tools, tool_resources, and orchestration +5. Execute `CREATE OR REPLACE AGENT` via `$spec$` DDL +6. Run verification queries to confirm the agent is functional + +**After the skill creates the agent, REVIEW the orchestration:** +- [ ] Every tool in TOOL_INVENTORY has a routing block in the orchestration +- [ ] No routing block references a tool NOT in TOOL_INVENTORY +- [ ] Sample questions from Phase 1 would route correctly based on the generated rules +- [ ] "When NOT to Use" in each tool description only references other tools that exist +- [ ] If the agent has >2 tools, the orchestration includes multi-tool combination patterns + +If any issue is found, ask the `cortex-agent` skill to **refine** — do NOT hand-edit the orchestration yourself. + +**Step 6d: Add SPCS graph tools to the agent (if selected in 6b)** + +If SPCS graph tools were selected in Step 6b, invoke the `cortex-agent` skill again in **edit** mode to add the SPCS service functions as `generic` tools (the same type Step 6a assigns to SPCS service functions): + +``` +skill: cortex-agent + +Context: Edit agent {DATABASE}.{SCHEMA}.{ONTOLOGY_NAME}_AGENT + - Add tools: Each SPCS service function as a generic tool + (e.g., graph_centrality, graph_community_detection, graph_shortest_path) + - Update orchestration instructions to include graph analytics routing +``` + +After the edit, append any SPCS tools added here that are not already in `TOOL_INVENTORY`, so the Step 6e tool-count cross-check stays accurate. + +**Step 6e: Verify and test the agent** + +**1. Describe the created agent via discovery:** + +``` +cortex agents describe {DATABASE}.{SCHEMA}.{ONTOLOGY_NAME}_AGENT +``` + +**2. 
Cross-check against TOOL_INVENTORY:** +- [ ] Agent tool count = `len(TOOL_INVENTORY)` +- [ ] Each tool name in TOOL_INVENTORY is registered in the agent (no missing, no extras) +- [ ] Each semantic view FQN in the agent matches the discovered FQN from Step 6-pre +- [ ] Each graph UDF function identifier matches the discovered FQN +- [ ] Orchestration mentions every tool by name with routing guidance + +**3. Test with Phase 1 business questions via `cortex-agent` skill:** + +Invoke the `cortex-agent` skill in **test/debug** mode. Map each business question from Phase 1 to its expected tool based on role: + +``` +skill: cortex-agent + +Context: Test the agent {DATABASE}.{SCHEMA}.{ONTOLOGY_NAME}_AGENT + - The agent has {len(TOOL_INVENTORY)} tools: {list from TOOL_INVENTORY} + - Run one sample question per tool, using Phase 1 business questions: + Map each question to its expected tool by role: + - Source data questions → base_query_tool + - Cross-type reasoning questions → ontology_query_tool (if in inventory) + - KG entity/relationship questions → kg_query_tool (if in inventory) + - Governance/coverage questions → metadata_query_tool (if in inventory) + - Hierarchy questions → graph tools (if in inventory) + - Verify each question routes to the CORRECT tool + - Verify responses contain expected data (non-empty, reasonable values) +``` + +**4. If routing is wrong:** Ask the `cortex-agent` skill to **refine** the orchestration — do NOT hand-edit the orchestration prompt yourself. + +**Self-check before gate**: Verify the agent matches TOOL_INVENTORY: +- [ ] `cortex agents describe` output confirms agent exists with correct tool count +- [ ] Each tool name in TOOL_INVENTORY appears in the agent description +- [ ] Step 6e tests passed — each tool received at least one correctly-routed question with non-empty response +- [ ] No unresolved errors from the `cortex-agent` skill + +If agent is missing, has wrong tool count, or tests failed, fix before presenting the gate. 
+ +**⚠️ MANDATORY STOPPING POINT — DO NOT PROCEED TO PHASE 7**: When the `cortex-agent` skill finishes, you MUST STOP HERE. Do NOT begin validation, do NOT run validation queries. You MUST use `ask_user_question`: + +``` +ask_user_question: "The Cortex Agent is deployed. Please test it with the questions above. Ready to proceed to Phase 7 (Validation)?" +Options: + - "Yes, proceed to Phase 7" + - "No, I need to fix the agent" +``` + +**STOP. Wait for the user's answer. Do not read ahead. Do not begin Phase 7 until the user selects an option above.** + +### Phase 7: End-to-End Validation + +> **GATE CHECK**: You should only be here if the user explicitly approved the Phase 6 gate above. If you arrived here without asking the user, STOP and go back to present the Phase 6 gate question. + +> **Note**: SQL artifacts (L1-L3) were deployed in Phase 4b. Base semantic view (L4-base) was deployed in Phase 4.5. Ontology-layer semantic views (L4-ontology) and Cortex Agent (L5) were deployed by native skills in Phases 5 and 6. This phase validates the full stack. + +**Step 7a: Validate all layers** + +Run validation queries across the full stack: + +- **L1 Physical**: Row counts for KG_NODE and KG_EDGE (if KG path) +- **L2 Metadata**: Row counts for all ~22 ONT_* tables +- **L3 Abstract Views**: Sample queries against each VW_ONT_* view +- **L4 Semantic Views**: Verify via `SHOW SEMANTIC VIEWS IN SCHEMA {DATABASE}.{SCHEMA}` — must include the base semantic view (Phase 4.5) + ontology-layer semantic views (Phase 5) +- **L5 Cortex Agent**: Run one end-to-end question (a Phase 1 business question) through the agent: + ```sql + SELECT SNOWFLAKE.CORTEX.AGENT( + '{DATABASE}.{SCHEMA}.{ONTOLOGY_NAME}_AGENT', + PARSE_JSON('{"messages": [{"role": "user", "content": "{BUSINESS_QUESTION}"}]}') + ); + ``` + +Present a summary table of all deployed objects with row counts and status. 
+ +**Self-check before gate**: Verify the full stack is healthy: +- [ ] L1: KG_NODE and KG_EDGE have non-zero row counts (if KG path) +- [ ] L2: All ~22 ONT_* tables exist and have seed data (row count > 0 for at least ONT_CLASS, ONT_RELATION_DEF, ONT_CLASS_MAP) +- [ ] L3: Every VW_ONT_* view returns rows without errors +- [ ] L4: `SHOW SEMANTIC VIEWS` count matches expected: 1 base (Phase 4.5) + N ontology-layer (Phase 5 selections) +- [ ] L5: Agent query returned a meaningful response (not an error or empty) + +If any layer fails validation, report the specific failure and attempt to fix (re-execute SQL, re-deploy view, etc.) before presenting the gate. Do NOT present a "deployment complete" gate with known failures. + +**⚠️ MANDATORY STOPPING POINT — FINAL GATE**: You MUST use `ask_user_question`: + +``` +ask_user_question: "End-to-end validation complete. All layers are deployed and verified. Is the deployment complete?" +Options: + - "Yes, deployment is complete" + - "No, there are issues to address" +``` + +## Degrees of Freedom + +| Decision | Freedom | Default | +|----------|---------|---------| +| KG vs direct-table path | **High** | Ask in Phase 1 | +| Use existing semantic vs create new base | **High** | Ask in Phase 1 (Step 1b) | +| Which ontology-layer semantic models (1-3) | **High** | All 3 if KG, 2 if direct | +| Graph analytics tools | **High** | Only offered if KG path | +| Ontology class/relation design | **Medium** | AI recommends, user confirms | +| View naming convention | **Low** | `VW_ONT_` prefix | +| Agent routing instructions | **Low** | Auto-generated from model descriptions | + +## Error Handling + +- **No tables found**: Verify DATABASE.SCHEMA exists and user has SELECT permission +- **OWL parse failure**: Check file format (OWL/XML, Turtle, RDF/XML). Parser auto-detects. 
+- **SQL execution error**: Show exact error, suggest GRANT statements if permission issue +- **Empty views**: Verify filter conditions in ONT_CLASS_MAP match actual data values +- **Semantic model validation failure**: Check that all referenced views exist and have data + +## Session Resume + +If invoked mid-session, use **discovery commands** to detect the current state from Snowflake — do not rely on session memory or local files alone. + +**Discovery-first state detection:** + +1. Check local artifacts: `/tmp/generated/`, `/tmp/ontology_parsed/` +2. Check Snowflake objects: + ``` + cortex semantic-views list --like '%' --in database {DATABASE} schema {SCHEMA} + cortex agents list --like '%' --in database {DATABASE} schema {SCHEMA} + ``` + ```sql + SHOW VIEWS LIKE 'VW_ONT_%' IN SCHEMA {DATABASE}.{SCHEMA}; + SHOW VIEWS LIKE 'V_%' IN SCHEMA {DATABASE}.{SCHEMA}; + SHOW TABLES LIKE 'ONT_%' IN SCHEMA {DATABASE}.{SCHEMA}; + SHOW USER FUNCTIONS LIKE '%_TOOL' IN SCHEMA {DATABASE}.{SCHEMA}; + ``` + +**Resume decision tree:** +- If `classes.json` exists → resume from Phase 3 (visualize) +- If `*.sql` files exist but not yet executed → resume from Phase 4b (deploy SQL) +- If `VW_ONT_*` views exist but no base semantic view → resume from Phase 4.5 (base semantic via `semantic-view` skill) +- If base semantic view exists but no ontology-layer semantic views → resume from Phase 5 (ontology semantic views via `semantic-view` skill) +- If ontology-layer semantic views exist but no agent → resume from Phase 6 (agent via `cortex-agent` skill) + - Phase 6 will run its own discovery (Step 6-pre) to build TOOL_INVENTORY from Snowflake, so no prior session context is needed +- If agent exists → resume from Phase 7 (validation) + +To classify semantic views, run `cortex semantic-views describe {FQN}` for each and check the table references: +- Covers source tables (not VW_ONT_* or V_*) → base semantic view +- Covers V_* views → KG semantic view +- Covers VW_ONT_* views → ontology 
semantic view +- Covers ONT_* tables → metadata semantic view + +Always confirm the detected state with the user before proceeding. + +## Output Files + +SQL artifacts are saved to `/tmp/generated/` during the session. Semantic views and the Cortex Agent are deployed directly to Snowflake by the native skills (no local YAML or JSON files). + +``` +/tmp/generated/ +├── 01_physical_layer.sql (KG path only) +├── 02_concrete_views.sql (V_{CLASS}, V_{REL} typed views) +├── 03_metadata_tables.sql (~22 tables + auto-populated seed data) +├── 04_abstract_views.sql +├── 05_view_generator_sp.sql +├── 06_inference_engine.sql (optional, KG path + user opted in) +├── 07_graph_traversal_tools.sql (optional, KG path + user opted in — 4 SQL UDF tools) +├── spcs_graph_service.py (if SPCS graph tools selected) +└── spcs_setup.sql (if SPCS graph tools selected) + +Deployed directly to Snowflake (via native skills): +├── Base semantic view (created by `semantic-view` skill in Phase 4.5, or existing) +├── Ontology semantic views (created by `semantic-view` skill in Phase 5) +└── Cortex Agent (created by `cortex-agent` skill in Phase 6) +``` diff --git a/skills/ontology-stack-builder/assets/ontology-graph.png b/skills/ontology-stack-builder/assets/ontology-graph.png new file mode 100644 index 00000000..6533264b Binary files /dev/null and b/skills/ontology-stack-builder/assets/ontology-graph.png differ diff --git a/skills/ontology-stack-builder/pyproject.toml b/skills/ontology-stack-builder/pyproject.toml new file mode 100644 index 00000000..4d184397 --- /dev/null +++ b/skills/ontology-stack-builder/pyproject.toml @@ -0,0 +1,14 @@ +[project] +name = "ontology-stack-builder" +version = "1.0.0" +description = "Cortex Code skill: generate full Ontology-on-Snowflake stack from any relational schema" +requires-python = ">=3.10" +dependencies = [ + "rdflib>=7.0.0", + "pyyaml>=6.0", + "streamlit>=1.30.0", + "streamlit-agraph>=0.0.45", +] + +[project.optional-dependencies] +dev = ["pytest", 
"ruff"] diff --git a/skills/ontology-stack-builder/references/abstract_views_template.sql b/skills/ontology-stack-builder/references/abstract_views_template.sql new file mode 100644 index 00000000..ecef06ef --- /dev/null +++ b/skills/ontology-stack-builder/references/abstract_views_template.sql @@ -0,0 +1,52 @@ +-- ============================================================================ +-- Abstract Ontology Views Template +-- Replace {DATABASE}, {SCHEMA} with actual values +-- ============================================================================ + +USE SCHEMA {DATABASE}.{SCHEMA}; + +-- Per-class entity view pattern (one per concrete class): +-- CREATE OR REPLACE VIEW VW_ONT_{CLASS_NAME} AS +-- SELECT +-- {ID_EXPR} AS ENTITY_ID, +-- '{CLASS_NAME}' AS ENTITY_TYPE, +-- {NAME_EXPR} AS ENTITY_NAME, +-- {SUBTYPE_EXPR} AS SUBTYPE, +-- {PROPS_EXPR} AS PROPS +-- FROM {SOURCE_TABLE} +-- WHERE {FILTER_COL} = '{FILTER_VAL}'; -- omit WHERE if no filter + +-- Unified entity view (UNION ALL of all concrete class views): +-- CREATE OR REPLACE VIEW VW_ONT_ALL_ENTITIES AS +-- SELECT ENTITY_ID, ENTITY_TYPE, ENTITY_NAME, SUBTYPE, PROPS FROM VW_ONT_{CLASS1} +-- UNION ALL +-- SELECT ENTITY_ID, ENTITY_TYPE, ENTITY_NAME, SUBTYPE, PROPS FROM VW_ONT_{CLASS2} +-- ...; + +-- Resolved relationships view: +-- CREATE OR REPLACE VIEW REL_RESOLVED AS +-- SELECT +-- r.REL_NAME, +-- r.SRC_ID_EXPR AS SRC_ID, +-- src.ENTITY_NAME AS SRC_NAME, +-- src.ENTITY_TYPE AS SRC_TYPE, +-- r.DST_ID_EXPR AS DST_ID, +-- dst.ENTITY_NAME AS DST_NAME, +-- dst.ENTITY_TYPE AS DST_TYPE, +-- r.WEIGHT_EXPR AS WEIGHT +-- FROM {EDGE_SOURCE} r +-- LEFT JOIN VW_ONT_ALL_ENTITIES src ON r.SRC_ID = src.ENTITY_ID +-- LEFT JOIN VW_ONT_ALL_ENTITIES dst ON r.DST_ID = dst.ENTITY_ID; + +-- Hierarchy stats view: +-- CREATE OR REPLACE VIEW VW_ONT_HIERARCHY_STATS AS +-- SELECT +-- e.ENTITY_ID, +-- e.ENTITY_NAME, +-- e.ENTITY_TYPE, +-- COUNT(DISTINCT child_rel.DST_ID) AS DIRECT_CHILDREN, +-- COUNT(DISTINCT parent_rel.SRC_ID) AS 
DIRECT_PARENTS +-- FROM VW_ONT_ALL_ENTITIES e +-- LEFT JOIN REL_RESOLVED child_rel ON e.ENTITY_ID = child_rel.SRC_ID AND child_rel.REL_NAME = 'subClassOf' +-- LEFT JOIN REL_RESOLVED parent_rel ON e.ENTITY_ID = parent_rel.DST_ID AND parent_rel.REL_NAME = 'subClassOf' +-- GROUP BY e.ENTITY_ID, e.ENTITY_NAME, e.ENTITY_TYPE; diff --git a/skills/ontology-stack-builder/references/agent_config_template.json b/skills/ontology-stack-builder/references/agent_config_template.json new file mode 100644 index 00000000..f4fdbe0e --- /dev/null +++ b/skills/ontology-stack-builder/references/agent_config_template.json @@ -0,0 +1,39 @@ +{ + "model_name": "claude-3-5-sonnet", + "tool_choice": "auto", + "max_tokens": 4096, + "tools": [ + { + "tool_spec": { + "type": "cortex_analyst_tool", + "name": "kg_analyst", + "description": "Use for CONCRETE entity queries: specific player stats, club details, match results, coach history. Fast direct lookups on physical tables. Best for: 'Who scored most goals?', 'Which club has most wins?', named entity queries.", + "semantic_model_file": "@{STAGE}/{KG_MODEL_FILE}" + } + }, + { + "tool_spec": { + "type": "cortex_analyst_tool", + "name": "ontology_analyst", + "description": "Use for ABSTRACT cross-type queries: entity unification, type hierarchy traversal, polymorphic queries. Queries abstract views (VW_ONT_*). Best for: 'What types of entities exist?', 'Show all people', cross-entity reasoning.", + "semantic_model_file": "@{STAGE}/{ONTOLOGY_MODEL_FILE}" + } + }, + { + "tool_spec": { + "type": "cortex_analyst_tool", + "name": "metadata_analyst", + "description": "Use for GOVERNANCE and INTROSPECTION queries: ontology structure, class definitions, relation definitions, property catalogs. 
Best for: 'What classes are defined?', 'How are entities related?', schema questions.", + "semantic_model_file": "@{STAGE}/{METADATA_MODEL_FILE}" + } + } + ], + "tool_resources": { + "cortex_analyst_tool": { + "kg_analyst": { "semantic_model_file": "@{STAGE}/{KG_MODEL_FILE}" }, + "ontology_analyst": { "semantic_model_file": "@{STAGE}/{ONTOLOGY_MODEL_FILE}" }, + "metadata_analyst": { "semantic_model_file": "@{STAGE}/{METADATA_MODEL_FILE}" } + } + }, + "instructions": "You are an intelligent data assistant with access to multiple semantic models organized by intent.\n\nMODEL SELECTION STRATEGY:\n1. CONCRETE queries (named entities, specific lookups, aggregations) → kg_analyst\n2. ABSTRACT queries (cross-type, hierarchy, unification, 'what types') → ontology_analyst\n3. GOVERNANCE queries (schema structure, class definitions, metadata) → metadata_analyst\n\nRULES:\n- Select the MOST SPECIFIC model that can answer the question\n- If unsure, start with kg_analyst (most common)\n- For multi-part questions, you may call multiple tools\n- Always explain which model you chose and why\n- Present results in clear tables or lists" +} diff --git a/skills/ontology-stack-builder/references/metadata_tables_template.sql b/skills/ontology-stack-builder/references/metadata_tables_template.sql new file mode 100644 index 00000000..4c47ac36 --- /dev/null +++ b/skills/ontology-stack-builder/references/metadata_tables_template.sql @@ -0,0 +1,80 @@ +-- ============================================================================ +-- Ontology Metadata Tables Template +-- Replace {DATABASE}, {SCHEMA}, {ONTOLOGY_NAME} with actual values +-- ============================================================================ + +USE SCHEMA {DATABASE}.{SCHEMA}; + +-- ONT_CLASS: Ontology class hierarchy +CREATE TABLE IF NOT EXISTS ONT_CLASS ( + CLASS_NAME STRING NOT NULL PRIMARY KEY, + PARENT_CLASS_NAME STRING, + IS_ABSTRACT BOOLEAN DEFAULT FALSE, + DESCRIPTION STRING, + ONTOLOGY_NAME STRING DEFAULT 
'{ONTOLOGY_NAME}', + TYPE_CLASS STRING DEFAULT 'ANALYTICAL', + CREATED_AT TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(), + UPDATED_AT TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP() +); + +-- ONT_RELATION_DEF: Relationship definitions +CREATE TABLE IF NOT EXISTS ONT_RELATION_DEF ( + REL_NAME STRING NOT NULL PRIMARY KEY, + DOMAIN_CLASS STRING NOT NULL, + RANGE_CLASS STRING NOT NULL, + CARDINALITY STRING DEFAULT 'N:N', + IS_HIERARCHICAL BOOLEAN DEFAULT FALSE, + IS_TRANSITIVE BOOLEAN DEFAULT FALSE, + INVERSE_REL_NAME STRING, + DESCRIPTION STRING, + ONTOLOGY_NAME STRING DEFAULT '{ONTOLOGY_NAME}', + CREATED_AT TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP() +); + +-- ONT_CLASS_MAP: Map ontology classes to physical tables +CREATE TABLE IF NOT EXISTS ONT_CLASS_MAP ( + MAP_ID STRING DEFAULT UUID_STRING() PRIMARY KEY, + CLASS_NAME STRING NOT NULL, + SOURCE_DATABASE STRING NOT NULL, + SOURCE_SCHEMA STRING NOT NULL, + SOURCE_TABLE STRING NOT NULL, + FILTER_COL STRING, + FILTER_VAL STRING, + ID_EXPR STRING NOT NULL, + NAME_EXPR STRING, + SUBTYPE_EXPR STRING, + PROPS_EXPR STRING, + ONTOLOGY_NAME STRING DEFAULT '{ONTOLOGY_NAME}', + CREATED_AT TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(), + FOREIGN KEY (CLASS_NAME) REFERENCES ONT_CLASS(CLASS_NAME) +); + +-- ONT_REL_MAP: Map relationships to physical edge sources +CREATE TABLE IF NOT EXISTS ONT_REL_MAP ( + MAP_ID STRING DEFAULT UUID_STRING() PRIMARY KEY, + REL_NAME STRING NOT NULL, + SOURCE_DATABASE STRING NOT NULL, + SOURCE_SCHEMA STRING NOT NULL, + SOURCE_TABLE STRING NOT NULL, + SRC_ID_EXPR STRING NOT NULL, + DST_ID_EXPR STRING NOT NULL, + FILTER_COL STRING, + FILTER_VAL STRING, + WEIGHT_EXPR STRING DEFAULT '1.0', + PROPS_EXPR STRING, + ONTOLOGY_NAME STRING DEFAULT '{ONTOLOGY_NAME}', + CREATED_AT TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(), + FOREIGN KEY (REL_NAME) REFERENCES ONT_RELATION_DEF(REL_NAME) +); + +-- ONT_PROPERTY: Shared properties across classes +CREATE TABLE IF NOT EXISTS ONT_PROPERTY ( + PROPERTY_NAME STRING NOT NULL, + CLASS_NAME 
STRING NOT NULL, + DATA_TYPE STRING DEFAULT 'STRING', + IS_REQUIRED BOOLEAN DEFAULT FALSE, + DESCRIPTION STRING, + SOURCE_EXPR STRING, + ONTOLOGY_NAME STRING DEFAULT '{ONTOLOGY_NAME}', + PRIMARY KEY (PROPERTY_NAME, CLASS_NAME) +); diff --git a/skills/ontology-stack-builder/references/physical_layer_template.sql b/skills/ontology-stack-builder/references/physical_layer_template.sql new file mode 100644 index 00000000..9d5fb432 --- /dev/null +++ b/skills/ontology-stack-builder/references/physical_layer_template.sql @@ -0,0 +1,58 @@ +-- ============================================================================ +-- Physical Layer Template (KG Path Only) +-- Creates KG_NODE and KG_EDGE tables for Knowledge Graph storage +-- Replace {DATABASE}, {SCHEMA} with actual values +-- ============================================================================ + +USE SCHEMA {DATABASE}.{SCHEMA}; + +-- KG_NODE: Universal node table for all entity types +CREATE OR REPLACE TABLE KG_NODE ( + NODE_ID STRING NOT NULL, + NODE_TYPE STRING NOT NULL, + NAME STRING, + PROPS VARIANT, + TS_INGESTED TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(), + CONSTRAINT PK_KG_NODE PRIMARY KEY (NODE_ID) +) CLUSTER BY (NODE_TYPE); + +-- KG_EDGE: Universal edge table for all relationship types +CREATE OR REPLACE TABLE KG_EDGE ( + EDGE_ID STRING NOT NULL, + SRC_ID STRING NOT NULL, + DST_ID STRING NOT NULL, + EDGE_TYPE STRING NOT NULL, + WEIGHT FLOAT DEFAULT 1.0, + PROPS VARIANT, + EFFECTIVE_START DATE, + EFFECTIVE_END DATE, + TS_INGESTED TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(), + CONSTRAINT PK_KG_EDGE PRIMARY KEY (EDGE_ID), + CONSTRAINT FK_EDGE_SRC FOREIGN KEY (SRC_ID) REFERENCES KG_NODE(NODE_ID), + CONSTRAINT FK_EDGE_DST FOREIGN KEY (DST_ID) REFERENCES KG_NODE(NODE_ID) +) CLUSTER BY (EDGE_TYPE, SRC_ID, DST_ID); + +-- Load pattern for nodes (one INSERT per source table): +-- INSERT INTO KG_NODE (NODE_ID, NODE_TYPE, NAME, PROPS) +-- SELECT +-- {ID_EXPR}::STRING AS NODE_ID, +-- '{CLASS_NAME}' AS NODE_TYPE, +-- 
{NAME_EXPR}::STRING AS NAME, +-- OBJECT_CONSTRUCT( +-- 'col1', COL1, +-- 'col2', COL2 +-- ) AS PROPS +-- FROM {SOURCE_TABLE} +-- WHERE {FILTER_CONDITION}; -- omit if no filter + +-- Load pattern for edges (one INSERT per relationship): +-- INSERT INTO KG_EDGE (EDGE_ID, SRC_ID, DST_ID, EDGE_TYPE, WEIGHT, PROPS) +-- SELECT +-- UUID_STRING() AS EDGE_ID, +-- {SRC_ID_EXPR}::STRING AS SRC_ID, +-- {DST_ID_EXPR}::STRING AS DST_ID, +-- '{REL_NAME}' AS EDGE_TYPE, +-- {WEIGHT_EXPR} AS WEIGHT, +-- {PROPS_EXPR} AS PROPS +-- FROM {SOURCE_TABLE} +-- WHERE {FILTER_CONDITION}; -- omit if no filter diff --git a/skills/ontology-stack-builder/references/semantic_model_template.yaml b/skills/ontology-stack-builder/references/semantic_model_template.yaml new file mode 100644 index 00000000..0bae6d4f --- /dev/null +++ b/skills/ontology-stack-builder/references/semantic_model_template.yaml @@ -0,0 +1,58 @@ +# Semantic Model Template for Ontology-on-Snowflake +# Replace placeholders: {DATABASE}, {SCHEMA}, {MODEL_NAME}, {DESCRIPTION} + +name: "{MODEL_NAME}" +description: > + {DESCRIPTION} + +tables: + # Pattern for entity view table: + # - name: "VW_ONT_{CLASS_NAME}" + # base_table: + # database: "{DATABASE}" + # schema: "{SCHEMA}" + # table: "VW_ONT_{CLASS_NAME}" + # primary_key: + # columns: ["ENTITY_ID"] + # dimensions: + # - name: "ENTITY_ID" + # expr: "ENTITY_ID" + # data_type: "VARCHAR" + # description: "Unique identifier" + # - name: "ENTITY_TYPE" + # expr: "ENTITY_TYPE" + # data_type: "VARCHAR" + # description: "Ontology class type" + # - name: "ENTITY_NAME" + # expr: "ENTITY_NAME" + # data_type: "VARCHAR" + # description: "Display name" + + # Pattern for relationship table: + # - name: "REL_RESOLVED" + # base_table: + # database: "{DATABASE}" + # schema: "{SCHEMA}" + # table: "REL_RESOLVED" + # dimensions: + # - name: "REL_NAME" + # expr: "REL_NAME" + # data_type: "VARCHAR" + # - name: "SRC_NAME" + # expr: "SRC_NAME" + # data_type: "VARCHAR" + # - name: "DST_NAME" + # expr: 
"DST_NAME" + # data_type: "VARCHAR" + # facts: + # - name: "WEIGHT" + # expr: "WEIGHT" + # data_type: "FLOAT" + +verified_queries: + # Pattern: + # - name: "query_name" + # question: "Natural language question" + # sql: "SELECT ... FROM ... WHERE ..." + # verified_at: 1700000000 + # verified_by: "ontology-stack-builder" diff --git a/skills/ontology-stack-builder/scripts/generate_agent_config.py b/skills/ontology-stack-builder/scripts/generate_agent_config.py new file mode 100644 index 00000000..b9990144 --- /dev/null +++ b/skills/ontology-stack-builder/scripts/generate_agent_config.py @@ -0,0 +1,599 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "pyyaml>=6.0", +# ] +# /// +""" +DEPRECATED: This script is no longer used by the ontology-stack-builder workflow. + +Agent creation is now handled by the native Cortex Code `cortex-agent` skill, +which uses CREATE OR REPLACE AGENT ... FROM SPECIFICATION $spec$...$spec$ DDL +with cortex_analyst_text_to_sql tools and proper tool_resources configuration. + +SPCS graph analytics scaffolding has been moved to generate_spcs_scaffolding.py. + +This file is retained for reference only. Do not invoke it from the skill workflow. +See SKILL.md Phase 6 for the current workflow. +""" + +import argparse +import json +import sys +from pathlib import Path + +import yaml + + +# --- Intent-routing descriptions for each model type --- +MODEL_DESCRIPTIONS = { + "kg": ( + "Use for CONCRETE entity queries: specific lookups, named entity searches, " + "aggregations on physical data, direct record retrieval. " + "Best for: 'Who scored most goals?', 'Which customer spent most?', " + "'Show details for X', entity-specific questions." + ), + "ontology": ( + "Use for ABSTRACT cross-type queries: entity unification, type hierarchy traversal, " + "polymorphic queries across entity types, 'what types of X exist?'. 
" + "Best for: 'What types of entities exist?', 'Show all people regardless of role', " + "'How are different entity types connected?', cross-entity reasoning." + ), + "metadata": ( + "Use for GOVERNANCE and INTROSPECTION queries: ontology structure, class definitions, " + "relationship types, mapping catalog, schema documentation. " + "Best for: 'What classes are defined?', 'How is the ontology structured?', " + "'What tables map to which classes?', data catalog questions." + ), +} + +# Tool name mapping +MODEL_TOOL_NAMES = { + "kg": "kg_analyst", + "ontology": "ontology_analyst", + "metadata": "metadata_analyst", +} + + +def detect_model_type(model_path: Path) -> str | None: + """Detect model type from filename.""" + name = model_path.stem.lower() + if "kg" in name and "ontology" not in name: + return "kg" + elif "ontology" in name: + return "ontology" + elif "metadata" in name or "governance" in name: + return "metadata" + return None + + +def build_agent_tools( + model_files: list[Path], + stage_name: str, +) -> list[dict]: + """Build cortex_analyst_tool entries for each semantic model.""" + tools = [] + + for mf in model_files: + model_type = detect_model_type(mf) + if model_type is None: + # Fallback: use filename as tool name + tool_name = mf.stem.replace("-", "_").replace(" ", "_").lower() + description = f"Semantic model from {mf.name}. Use for queries against this model." 
def build_graph_tools(database: str, schema: str) -> list[dict]:
    """Build graph analytics tool entries (requires SPCS service).

    Args:
        database: Database that owns the graph service functions.
        schema: Schema that owns the graph service functions.

    Returns:
        Three ``tool_spec`` entries of type ``function`` (centrality,
        community detection, shortest path). Each one points at the service
        function ``{database}.{schema}.GRAPH_*``.
    """
    # Every service function lives under the same database.schema prefix.
    fqn = f"{database}.{schema}"

    return [
        {
            "tool_spec": {
                "type": "function",
                "name": "graph_centrality",
                "description": (
                    "Calculate centrality metrics for entities in the knowledge graph. "
                    "Returns top N most central/important entities. Use for: "
                    "'Who is most connected?', 'Most influential entity', 'Key players'."
                ),
                "function": {
                    "name": f"{fqn}.GRAPH_CENTRALITY",
                    "description": "Compute centrality (degree, betweenness, or PageRank) for graph nodes",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "metric": {
                                "type": "string",
                                "enum": ["degree", "betweenness", "pagerank"],
                                "description": "Centrality metric to compute",
                            },
                            "entity_type": {
                                "type": "string",
                                "description": "Optional: filter to specific entity type",
                            },
                            "top_n": {
                                "type": "integer",
                                "description": "Number of top results to return (default 10)",
                            },
                        },
                        "required": ["metric"],
                    },
                },
            }
        },
        {
            "tool_spec": {
                "type": "function",
                "name": "graph_community_detection",
                "description": (
                    "Detect communities/clusters in the knowledge graph using Louvain algorithm. "
                    "Returns community assignments and sizes. Use for: "
                    "'What groups exist?', 'Find clusters', 'Community structure'."
                ),
                "function": {
                    "name": f"{fqn}.GRAPH_COMMUNITY_DETECTION",
                    "description": "Detect communities using Louvain method",
                    # No "required" list: resolution is optional.
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "resolution": {
                                "type": "number",
                                "description": "Resolution parameter (higher = more communities, default 1.0)",
                            },
                        },
                    },
                },
            }
        },
        {
            "tool_spec": {
                "type": "function",
                "name": "graph_shortest_path",
                "description": (
                    "Find the shortest path between two entities in the knowledge graph. "
                    "Returns the path with intermediate nodes and relationship types. Use for: "
                    "'How is A connected to B?', 'Path between X and Y', 'Degrees of separation'."
                ),
                "function": {
                    "name": f"{fqn}.GRAPH_SHORTEST_PATH",
                    "description": "Find shortest path between two entities",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "source_id": {
                                "type": "string",
                                "description": "Source entity ID",
                            },
                            "target_id": {
                                "type": "string",
                                "description": "Target entity ID",
                            },
                        },
                        "required": ["source_id", "target_id"],
                    },
                },
            }
        },
    ]


def build_instructions(model_types: list[str], include_graph: bool) -> str:
    """Build agent orchestration instructions.

    Args:
        model_types: Detected semantic model types; the recognised values are
            ``"kg"``, ``"ontology"`` and ``"metadata"``. Other values are
            ignored here.
        include_graph: Whether to add routing hints for the graph analytics
            tools.

    Returns:
        The instruction text, one rule per line, joined with newlines.
    """
    # (model type, analyst tool, kind of query routed to it), in priority order.
    routing = (
        ("kg", "kg_analyst",
         "CONCRETE queries (named entities, specific lookups, aggregations)"),
        ("ontology", "ontology_analyst",
         "ABSTRACT queries (cross-type, hierarchy, unification, 'what types')"),
        ("metadata", "metadata_analyst",
         "GOVERNANCE queries (schema structure, class definitions, metadata)"),
    )
    available = [(tool, blurb) for key, tool, blurb in routing if key in model_types]

    lines = [
        "You are an intelligent data assistant with access to multiple semantic models organized by intent.",
        "",
        "MODEL SELECTION STRATEGY:",
    ]
    # Number only the models that are present so the list has no gaps.
    lines.extend(
        f"{position}. {blurb} → {tool}"
        for position, (tool, blurb) in enumerate(available, start=1)
    )

    if include_graph:
        lines.extend([
            "",
            "GRAPH ANALYTICS:",
            "- For importance/influence questions → graph_centrality",
            "- For grouping/clustering questions → graph_community_detection",
            "- For connection/path questions → graph_shortest_path",
        ])

    lines.extend([
        "",
        "RULES:",
        "- Select the MOST SPECIFIC model that can answer the question",
    ])
    if available:
        # Recommend a tool that is actually configured: the highest-priority
        # available analyst, not a hard-coded name that may not exist.
        lines.append(f"- If unsure, start with {available[0][0]} (most common)")
    lines.extend([
        "- For multi-part questions, you may call multiple tools",
        "- Always explain which model you chose and why",
        "- Present results in clear tables or lists",
    ])

    return "\n".join(lines)
+""" + +import os +import json +import logging +from flask import Flask, request, jsonify + +import networkx as nx +from snowflake.snowpark import Session +from community import community_louvain + +app = Flask(__name__) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Global graph cache +_graph: nx.Graph | None = None +_node_attrs: dict = {} + + +def get_session() -> Session: + """Create Snowpark session from SPCS environment.""" + return Session.builder.configs({ + "account": os.environ["SNOWFLAKE_ACCOUNT"], + "host": os.environ["SNOWFLAKE_HOST"], + "token": open("/snowflake/session/token").read().strip(), + "authenticator": "oauth", + "database": "''' + database + '''", + "schema": "''' + schema + '''", + }).create() + + +def load_graph() -> nx.Graph: + """Load KG_NODE and KG_EDGE into a NetworkX graph.""" + global _graph, _node_attrs + if _graph is not None: + return _graph + + logger.info("Loading graph from KG_NODE/KG_EDGE...") + session = get_session() + + nodes_df = session.sql("SELECT NODE_ID, NODE_TYPE, NAME FROM KG_NODE").collect() + edges_df = session.sql("SELECT SRC_ID, DST_ID, EDGE_TYPE, WEIGHT FROM KG_EDGE WHERE EFFECTIVE_END IS NULL").collect() + + G = nx.Graph() + for row in nodes_df: + G.add_node(row["NODE_ID"], node_type=row["NODE_TYPE"], name=row["NAME"]) + _node_attrs[row["NODE_ID"]] = {"type": row["NODE_TYPE"], "name": row["NAME"]} + + for row in edges_df: + G.add_edge(row["SRC_ID"], row["DST_ID"], + edge_type=row["EDGE_TYPE"], + weight=float(row["WEIGHT"] or 1.0)) + + _graph = G + logger.info(f"Graph loaded: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges") + session.close() + return G + + +@app.route("/health", methods=["GET"]) +def health(): + return jsonify({"status": "healthy"}) + + +@app.route("/centrality", methods=["POST"]) +def centrality(): + """Compute centrality metrics.""" + data = request.json + metric = data.get("metric", "degree") + entity_type = data.get("entity_type") + top_n = 
data.get("top_n", 10) + + G = load_graph() + + if entity_type: + nodes = [n for n, d in G.nodes(data=True) if d.get("node_type") == entity_type] + subgraph = G.subgraph(nodes) + else: + subgraph = G + + if metric == "degree": + scores = nx.degree_centrality(subgraph) + elif metric == "betweenness": + scores = nx.betweenness_centrality(subgraph, k=min(100, len(subgraph))) + elif metric == "pagerank": + scores = nx.pagerank(subgraph) + else: + return jsonify({"error": f"Unknown metric: {metric}"}), 400 + + top = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n] + results = [] + for node_id, score in top: + attrs = _node_attrs.get(node_id, {}) + results.append({ + "node_id": node_id, + "name": attrs.get("name", ""), + "type": attrs.get("type", ""), + "score": round(score, 6), + }) + + return jsonify({"metric": metric, "results": results}) + + +@app.route("/community", methods=["POST"]) +def community_detection(): + """Detect communities using Louvain.""" + data = request.json or {} + resolution = data.get("resolution", 1.0) + + G = load_graph() + partition = community_louvain.best_partition(G, resolution=resolution) + + communities = {} + for node_id, comm_id in partition.items(): + if comm_id not in communities: + communities[comm_id] = [] + attrs = _node_attrs.get(node_id, {}) + communities[comm_id].append({ + "node_id": node_id, + "name": attrs.get("name", ""), + "type": attrs.get("type", ""), + }) + + summary = [{"community_id": cid, "size": len(members), "sample": members[:5]} + for cid, members in sorted(communities.items(), key=lambda x: -len(x[1]))] + + return jsonify({"num_communities": len(communities), "communities": summary[:20]}) + + +@app.route("/shortest_path", methods=["POST"]) +def shortest_path(): + """Find shortest path between two nodes.""" + data = request.json + source = data.get("source_id") + target = data.get("target_id") + + if not source or not target: + return jsonify({"error": "source_id and target_id are required"}), 400 + 
+ G = load_graph() + + if source not in G: + return jsonify({"error": f"Source node {source} not found"}), 404 + if target not in G: + return jsonify({"error": f"Target node {target} not found"}), 404 + + try: + path = nx.shortest_path(G, source, target) + except nx.NetworkXNoPath: + return jsonify({"error": "No path exists between these nodes", "path": []}) + + path_details = [] + for i, node_id in enumerate(path): + attrs = _node_attrs.get(node_id, {}) + entry = {"step": i, "node_id": node_id, "name": attrs.get("name", ""), "type": attrs.get("type", "")} + if i < len(path) - 1: + edge_data = G.edges[node_id, path[i + 1]] + entry["edge_type"] = edge_data.get("edge_type", "") + path_details.append(entry) + + return jsonify({"length": len(path) - 1, "path": path_details}) + + +@app.route("/reload", methods=["POST"]) +def reload_graph(): + """Force reload the graph from Snowflake.""" + global _graph, _node_attrs + _graph = None + _node_attrs = {} + load_graph() + return jsonify({"status": "reloaded", "nodes": _graph.number_of_nodes(), "edges": _graph.number_of_edges()}) + + +if __name__ == "__main__": + app.run(host="0.0.0.0", port=8080) +''' + + +def generate_spcs_setup_sql(database: str, schema: str) -> str: + """Generate SPCS setup SQL for graph analytics.""" + return f"""-- ============================================================================= +-- SPCS Graph Analytics Setup +-- Generated by ontology-stack-builder +-- ============================================================================= + +-- 1. Create compute pool (adjust size as needed) +CREATE COMPUTE POOL IF NOT EXISTS {schema}_GRAPH_POOL + MIN_NODES = 1 + MAX_NODES = 1 + INSTANCE_FAMILY = CPU_X64_XS + AUTO_RESUME = TRUE + AUTO_SUSPEND_SECS = 300; + +-- 2. Create image repository +CREATE IMAGE REPOSITORY IF NOT EXISTS {database}.{schema}.GRAPH_IMAGES; + +-- 3. Build and push the Docker image: +-- docker build -t graph-analytics . 
+-- docker tag graph-analytics /graph-analytics:latest +-- docker push /graph-analytics:latest +-- +-- Get repo URL with: +-- SHOW IMAGE REPOSITORIES IN SCHEMA {database}.{schema}; + +-- 4. Create the service +CREATE SERVICE IF NOT EXISTS {database}.{schema}.GRAPH_ANALYTICS_SERVICE + IN COMPUTE POOL {schema}_GRAPH_POOL + MIN_INSTANCES = 1 + MAX_INSTANCES = 1 + FROM SPECIFICATION $$ + spec: + containers: + - name: graph-analytics + image: /{{repo_url}}/graph-analytics:latest + resources: + requests: + cpu: 1 + memory: 2Gi + limits: + cpu: 2 + memory: 4Gi + env: + SNOWFLAKE_ACCOUNT: {{{{SNOWFLAKE_ACCOUNT}}}} + SNOWFLAKE_HOST: {{{{SNOWFLAKE_HOST}}}} + readinessProbe: + path: /health + port: 8080 + endpoints: + - name: graph-api + port: 8080 + public: false + $$; + +-- 5. Create service functions +CREATE OR REPLACE FUNCTION {database}.{schema}.GRAPH_CENTRALITY( + metric VARCHAR, + entity_type VARCHAR DEFAULT NULL, + top_n INTEGER DEFAULT 10 +) +RETURNS VARIANT +SERVICE = {database}.{schema}.GRAPH_ANALYTICS_SERVICE +ENDPOINT = 'graph-api' +AS '/centrality'; + +CREATE OR REPLACE FUNCTION {database}.{schema}.GRAPH_COMMUNITY_DETECTION( + resolution FLOAT DEFAULT 1.0 +) +RETURNS VARIANT +SERVICE = {database}.{schema}.GRAPH_ANALYTICS_SERVICE +ENDPOINT = 'graph-api' +AS '/community'; + +CREATE OR REPLACE FUNCTION {database}.{schema}.GRAPH_SHORTEST_PATH( + source_id VARCHAR, + target_id VARCHAR +) +RETURNS VARIANT +SERVICE = {database}.{schema}.GRAPH_ANALYTICS_SERVICE +ENDPOINT = 'graph-api' +AS '/shortest_path'; + +-- 6. Grant usage +-- GRANT USAGE ON FUNCTION {database}.{schema}.GRAPH_CENTRALITY(VARCHAR, VARCHAR, INTEGER) TO ROLE ; +-- GRANT USAGE ON FUNCTION {database}.{schema}.GRAPH_COMMUNITY_DETECTION(FLOAT) TO ROLE ; +-- GRANT USAGE ON FUNCTION {database}.{schema}.GRAPH_SHORTEST_PATH(VARCHAR, VARCHAR) TO ROLE ; + +-- 7. 
def main():
    """Command-line entry point: generate the Cortex Agent configuration.

    Reads the semantic model paths given on the command line, builds the tool
    list and orchestration instructions, and writes
    ``cortex_agent_config.json`` into ``--output-dir``. When graph tools are
    requested it also writes ``spcs_graph_service.py`` and ``spcs_setup.sql``.

    Exits with status 1 when none of the given semantic model files exist.
    """
    parser = argparse.ArgumentParser(description="Generate Cortex Agent Config")
    parser.add_argument("--semantic-models", required=True,
                        help="Comma-separated paths to semantic model YAML files")
    parser.add_argument("--database", required=True)
    parser.add_argument("--schema", required=True)
    parser.add_argument("--stage-name", default=None,
                        help="Snowflake stage name (default: @DATABASE.SCHEMA.ONTOLOGY_STAGE)")
    parser.add_argument("--include-graph-tools", default="false",
                        help="Include graph analytics tools (true/false)")
    parser.add_argument("--output-dir", required=True)
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    database = args.database
    schema = args.schema
    stage_name = args.stage_name or f"@{database}.{schema}.ONTOLOGY_STAGE"
    # Normalize: ensure stage_name starts with exactly one @
    stage_name = "@" + stage_name.lstrip("@")
    include_graph = args.include_graph_tools.lower() in ("true", "1", "yes")

    # Parse model file paths, warning about (and skipping) missing files.
    model_paths = []
    for raw in args.semantic_models.split(","):
        raw = raw.strip()
        if not raw:
            continue
        candidate = Path(raw)
        if candidate.exists():
            model_paths.append(candidate)
        else:
            print(f" Warning: semantic model file not found: {raw}", file=sys.stderr)

    if not model_paths:
        print("Error: No valid semantic model files found", file=sys.stderr)
        sys.exit(1)

    # Detect model types; files whose type cannot be detected contribute none.
    model_types = []
    for mp in model_paths:
        mt = detect_model_type(mp)
        if mt:
            model_types.append(mt)

    print(f"Building agent config for {len(model_paths)} semantic model(s)")
    print(f" Models: {', '.join(mp.name for mp in model_paths)}")
    print(f" Graph tools: {'yes' if include_graph else 'no'}")

    # Build tools list
    tools = build_agent_tools(model_paths, stage_name)
    if include_graph:
        tools.extend(build_graph_tools(database, schema))

    # Build instructions
    instructions = build_instructions(model_types, include_graph)

    # Assemble agent config
    agent_config = {
        "model_name": "claude-3-5-sonnet",
        "tool_choice": "auto",
        "max_tokens": 4096,
        "tools": tools,
        "instructions": instructions,
    }

    # Write agent config. ensure_ascii=False keeps non-ASCII characters (the
    # instructions contain arrows), so the file must be written as UTF-8
    # explicitly; the locale default encoding cannot represent them on every
    # platform.
    config_path = output_dir / "cortex_agent_config.json"
    config_path.write_text(
        json.dumps(agent_config, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f" Wrote {config_path} ({len(tools)} tools)")

    # Generate SPCS scaffolding if graph tools requested
    if include_graph:
        service_path = output_dir / "spcs_graph_service.py"
        service_path.write_text(
            generate_spcs_graph_service(database, schema), encoding="utf-8"
        )
        print(f" Wrote {service_path}")

        setup_path = output_dir / "spcs_setup.sql"
        setup_path.write_text(
            generate_spcs_setup_sql(database, schema), encoding="utf-8"
        )
        print(f" Wrote {setup_path}")

    print("\nAgent config generation complete")
+ +Generates: + - 01_physical_layer.sql (KG path only): KG_NODE/KG_EDGE tables + data load + - 02_concrete_views.sql: V_{CLASS} typed entity views, V_{REL} relationship views + - 03_metadata_tables.sql: ONT_CLASS, ONT_RELATION_DEF, ONT_CLASS_MAP, ONT_REL_MAP + expanded tables + inserts + - 04_abstract_views.sql: VW_ONT_* views, REL_RESOLVED, VW_ONT_ALL_ENTITIES + - 05_view_generator_sp.sql: SP_GENERATE_ONTOLOGY_VIEWS() stored procedure + +Usage: + uv run --project python /scripts/generate_ontology_sql.py \ + --classes-json /tmp/ontology_parsed/classes.json \ + --relations-json /tmp/ontology_parsed/relations.json \ + --mappings-json /tmp/ontology_parsed/mappings.json \ + --database MYDB --schema MYSCHEMA --ontology-name DOMAIN \ + --kg-path true \ + --output-dir /tmp/generated +""" + +import argparse +import json +import sys +from datetime import datetime, timezone +from pathlib import Path + + +def sql_escape(val: str | None) -> str: + if val is None: + return "NULL" + return "'" + str(val).replace("'", "''") + "'" + + +def generate_physical_layer_sql( + classes: list[dict], + relations: list[dict], + mappings: dict, + database: str, + schema: str, +) -> str: + """Generate KG_NODE/KG_EDGE DDL and data load INSERTs.""" + fqn = f"{database}.{schema}" + lines = [] + lines.append(f"-- {'='*76}") + lines.append(f"-- Layer 1: Physical KG Tables") + lines.append(f"-- Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}") + lines.append(f"-- {'='*76}\n") + lines.append(f"USE SCHEMA {fqn};\n") + + # KG_NODE + lines.append("CREATE OR REPLACE TABLE KG_NODE (") + lines.append(" NODE_ID STRING NOT NULL,") + lines.append(" NODE_TYPE STRING NOT NULL,") + lines.append(" NAME STRING,") + lines.append(" PROPS VARIANT,") + lines.append(" TS_INGESTED TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),") + lines.append(" CONSTRAINT PK_KG_NODE PRIMARY KEY (NODE_ID)") + lines.append(") CLUSTER BY (NODE_TYPE);\n") + + # KG_EDGE + lines.append("CREATE OR REPLACE TABLE KG_EDGE (") 
+ lines.append(" EDGE_ID STRING NOT NULL,") + lines.append(" SRC_ID STRING NOT NULL,") + lines.append(" DST_ID STRING NOT NULL,") + lines.append(" EDGE_TYPE STRING NOT NULL,") + lines.append(" WEIGHT FLOAT DEFAULT 1.0,") + lines.append(" PROPS VARIANT,") + lines.append(" EFFECTIVE_START DATE,") + lines.append(" EFFECTIVE_END DATE,") + lines.append(" TS_INGESTED TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),") + lines.append(" CONSTRAINT PK_KG_EDGE PRIMARY KEY (EDGE_ID)") + lines.append(") CLUSTER BY (EDGE_TYPE, SRC_ID, DST_ID);\n") + + # Node load inserts + class_maps = mappings.get("class_mappings", []) + for cm in class_maps: + cls_name = cm["class_name"] + src_table = cm["source_table"] + id_col = cm["id_column"] + name_col = cm.get("name_column") or id_col + + lines.append(f"-- Load {cls_name} nodes from {src_table}") + lines.append(f"INSERT INTO KG_NODE (NODE_ID, NODE_TYPE, NAME, PROPS)") + lines.append(f"SELECT") + lines.append(f" {id_col}::STRING AS NODE_ID,") + lines.append(f" '{cls_name}' AS NODE_TYPE,") + lines.append(f" {name_col}::STRING AS NAME,") + lines.append(f" OBJECT_CONSTRUCT(*) AS PROPS") + lines.append(f"FROM {src_table};") + lines.append("") + + # Edge load inserts + rel_maps = mappings.get("relation_mappings", []) + for rm in rel_maps: + rel_name = rm["rel_name"] + src_table = rm["source_table"] + src_col = rm.get("src_column") or rm.get("src_id_column", "SRC_ID") + dst_col = rm.get("dst_column") or rm.get("dst_id_column", "DST_ID") + + lines.append(f"-- Load {rel_name} edges from {src_table}") + lines.append(f"INSERT INTO KG_EDGE (EDGE_ID, SRC_ID, DST_ID, EDGE_TYPE)") + lines.append(f"SELECT") + lines.append(f" UUID_STRING() AS EDGE_ID,") + lines.append(f" {src_col}::STRING AS SRC_ID,") + lines.append(f" {dst_col}::STRING AS DST_ID,") + lines.append(f" '{rel_name}' AS EDGE_TYPE") + lines.append(f"FROM {src_table};") + lines.append("") + + return "\n".join(lines) + + +# Mapping from Snowflake column types to PROPS extraction cast types 
+SNOWFLAKE_TYPE_MAP = { + "NUMBER": "NUMBER", + "DECIMAL": "NUMBER", + "NUMERIC": "NUMBER", + "INT": "INTEGER", + "INTEGER": "INTEGER", + "BIGINT": "INTEGER", + "SMALLINT": "INTEGER", + "TINYINT": "INTEGER", + "BYTEINT": "INTEGER", + "FLOAT": "FLOAT", + "FLOAT4": "FLOAT", + "FLOAT8": "FLOAT", + "DOUBLE": "FLOAT", + "DOUBLE PRECISION": "FLOAT", + "REAL": "FLOAT", + "VARCHAR": "STRING", + "CHAR": "STRING", + "CHARACTER": "STRING", + "STRING": "STRING", + "TEXT": "STRING", + "BINARY": "STRING", + "VARBINARY": "STRING", + "BOOLEAN": "BOOLEAN", + "DATE": "DATE", + "DATETIME": "TIMESTAMP_NTZ", + "TIME": "TIME", + "TIMESTAMP": "TIMESTAMP_NTZ", + "TIMESTAMP_LTZ": "TIMESTAMP_LTZ", + "TIMESTAMP_NTZ": "TIMESTAMP_NTZ", + "TIMESTAMP_TZ": "TIMESTAMP_TZ", + "VARIANT": "VARIANT", + "OBJECT": "VARIANT", + "ARRAY": "VARIANT", +} + + +def _sf_cast_type(col_type: str) -> str: + """Map a Snowflake column type to a PROPS extraction cast type.""" + # Strip precision/scale like NUMBER(10,2) -> NUMBER + base = col_type.upper().split("(")[0].strip() + return SNOWFLAKE_TYPE_MAP.get(base, "STRING") + + +def generate_concrete_views_sql( + mappings: dict, + database: str, + schema: str, + kg_path: bool, +) -> str: + """Generate concrete typed entity views (V_{CLASS}) and relationship views. + + For KG path: V_{CLASS} extracts PROPS fields into typed columns from KG_NODE. + For direct path: V_{CLASS} is a thin wrapper over the source table with typed columns. 
+ """ + fqn = f"{database}.{schema}" + lines = [] + lines.append(f"-- {'='*76}") + lines.append(f"-- Concrete Entity & Relationship Views") + lines.append(f"-- Typed views over KG_NODE/KG_EDGE (KG path) or source tables (direct path)") + lines.append(f"-- Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}") + lines.append(f"-- {'='*76}\n") + lines.append(f"USE SCHEMA {fqn};\n") + + class_maps = mappings.get("class_mappings", []) + + # Per-class concrete entity views + for cm in class_maps: + cls_name = cm["class_name"] + view_name = f"V_{cls_name.upper()}" + columns = cm.get("columns", []) + id_col = cm["id_column"] + name_col = cm.get("name_column") + + if kg_path: + # Extract typed properties from KG_NODE PROPS variant + lines.append(f"-- {view_name}: Typed view for {cls_name} (KG path)") + lines.append(f"CREATE OR REPLACE VIEW {view_name} AS") + lines.append(f"SELECT") + lines.append(f" NODE_ID,") + lines.append(f" NAME,") + + # Project each non-PK, non-FK column as PROPS:col_name::TYPE + prop_cols = [ + c for c in columns + if not c.get("is_primary_key") and not c.get("is_foreign_key") + and c["name"].upper() not in ("NODE_ID", "NODE_TYPE", "NAME", "PROPS", "TS_INGESTED") + ] + for col in prop_cols: + cast_type = _sf_cast_type(col["data_type"]) + col_lower = col["name"].lower() + lines.append(f" PROPS:{col_lower}::{cast_type} AS {col['name']},") + + lines.append(f" PROPS") + lines.append(f"FROM KG_NODE") + lines.append(f"WHERE NODE_TYPE = '{cls_name}';\n") + else: + # Direct table path: select all typed columns from source table + src = cm["source_table"] + lines.append(f"-- {view_name}: Typed view for {cls_name} (direct table path)") + lines.append(f"CREATE OR REPLACE VIEW {view_name} AS") + lines.append(f"SELECT") + + select_cols = [] + for col in columns: + select_cols.append(f" {col['name']}") + + if not select_cols: + # Fallback: SELECT * + lines.append(f" *") + else: + lines.append(",\n".join(select_cols)) + + lines.append(f"FROM 
{src};\n") + + # Per-relation concrete relationship views (KG path only — direct path uses source tables directly) + if kg_path: + rel_maps = mappings.get("relation_mappings", []) + for rm in rel_maps: + rel_name = rm["rel_name"] + view_name = f"V_{rel_name.upper()}" + extra_cols = rm.get("columns", []) + + lines.append(f"-- {view_name}: Relationship view for {rel_name} (KG path)") + lines.append(f"CREATE OR REPLACE VIEW {view_name} AS") + lines.append(f"SELECT") + lines.append(f" SRC_ID,") + lines.append(f" DST_ID,") + lines.append(f" EDGE_TYPE,") + lines.append(f" PROPS,") + lines.append(f" WEIGHT,") + lines.append(f" EFFECTIVE_START,") + lines.append(f" EFFECTIVE_END,") + + # Project extra columns from PROPS if available + for col in extra_cols: + cast_type = _sf_cast_type(col["data_type"]) + col_lower = col["name"].lower() + lines.append(f" PROPS:{col_lower}::{cast_type} AS {col['name']},") + + # Remove trailing comma from last PROPS extraction line if any extra cols were added + if extra_cols: + lines[-1] = lines[-1].rstrip(",") + else: + # Remove trailing comma from EFFECTIVE_END line + lines[-1] = lines[-1].rstrip(",") + + lines.append(f"FROM KG_EDGE") + lines.append(f"WHERE EDGE_TYPE = '{rel_name}';\n") + + return "\n".join(lines) + + +def generate_metadata_sql( + classes: list[dict], + relations: list[dict], + mappings: dict, + database: str, + schema: str, + ontology_name: str, + kg_path: bool, +) -> str: + """Generate metadata tables DDL and INSERT statements.""" + fqn = f"{database}.{schema}" + lines = [] + lines.append(f"-- {'='*76}") + lines.append(f"-- Layer 2: Ontology Metadata Tables for {ontology_name}") + lines.append(f"-- Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}") + lines.append(f"-- {'='*76}\n") + lines.append(f"USE SCHEMA {fqn};\n") + + # ONT_CLASS + lines.append("CREATE TABLE IF NOT EXISTS ONT_CLASS (") + lines.append(" CLASS_NAME STRING NOT NULL PRIMARY KEY,") + lines.append(" PARENT_CLASS_NAME STRING,") + 
lines.append(" IS_ABSTRACT BOOLEAN DEFAULT FALSE,") + lines.append(" DESCRIPTION STRING,") + lines.append(f" ONTOLOGY_NAME STRING DEFAULT {sql_escape(ontology_name)},") + lines.append(" TYPE_CLASS STRING DEFAULT 'ANALYTICAL',") + lines.append(" CREATED_AT TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP()") + lines.append(");\n") + + if classes: + lines.append("INSERT INTO ONT_CLASS (CLASS_NAME, PARENT_CLASS_NAME, IS_ABSTRACT, DESCRIPTION, TYPE_CLASS)") + lines.append("SELECT * FROM VALUES") + rows = [] + for cls in classes: + if cls.get("is_deprecated"): + continue + name = cls.get("class_name") or cls.get("name") + parent = cls.get("parent_class_name") or cls.get("parent_name") + is_abs = cls.get("is_abstract", False) + desc = (cls.get("description") or cls.get("label") or "")[:500] + type_class = "ANALYTICAL" if is_abs else "OPERATIONAL" + rows.append( + f" ({sql_escape(name)}, {sql_escape(parent)}, {str(is_abs).upper()}, " + f"{sql_escape(desc)}, {sql_escape(type_class)})" + ) + lines.append(",\n".join(rows)) + lines.append("AS t(CLASS_NAME, PARENT_CLASS_NAME, IS_ABSTRACT, DESCRIPTION, TYPE_CLASS);") + lines.append("") + + # ONT_RELATION_DEF + lines.append("CREATE TABLE IF NOT EXISTS ONT_RELATION_DEF (") + lines.append(" REL_NAME STRING NOT NULL PRIMARY KEY,") + lines.append(" DOMAIN_CLASS STRING NOT NULL,") + lines.append(" RANGE_CLASS STRING NOT NULL,") + lines.append(" CARDINALITY STRING DEFAULT 'N:N',") + lines.append(" IS_HIERARCHICAL BOOLEAN DEFAULT FALSE,") + lines.append(" IS_TRANSITIVE BOOLEAN DEFAULT FALSE,") + lines.append(" INVERSE_REL_NAME STRING,") + lines.append(" DESCRIPTION STRING,") + lines.append(f" ONTOLOGY_NAME STRING DEFAULT {sql_escape(ontology_name)}") + lines.append(");\n") + + if relations: + lines.append("INSERT INTO ONT_RELATION_DEF (REL_NAME, DOMAIN_CLASS, RANGE_CLASS, CARDINALITY, IS_HIERARCHICAL, IS_TRANSITIVE, INVERSE_REL_NAME, DESCRIPTION)") + lines.append("SELECT * FROM VALUES") + rows = [] + for rel in relations: + rn = 
rel.get("rel_name") or rel.get("name") + rows.append( + f" ({sql_escape(rn)}, {sql_escape(rel.get('domain_class', 'Thing'))}, " + f"{sql_escape(rel.get('range_class', 'Thing'))}, {sql_escape(rel.get('cardinality', 'N:N'))}, " + f"{str(rel.get('is_hierarchical', False)).upper()}, {str(rel.get('is_transitive', False)).upper()}, " + f"{sql_escape(rel.get('inverse_name'))}, {sql_escape((rel.get('description') or '')[:500])})" + ) + lines.append(",\n".join(rows)) + lines.append("AS t(REL_NAME, DOMAIN_CLASS, RANGE_CLASS, CARDINALITY, IS_HIERARCHICAL, IS_TRANSITIVE, INVERSE_REL_NAME, DESCRIPTION);") + lines.append("") + + # ONT_CLASS_MAP + lines.append("CREATE TABLE IF NOT EXISTS ONT_CLASS_MAP (") + lines.append(" MAP_ID STRING DEFAULT UUID_STRING() PRIMARY KEY,") + lines.append(" CLASS_NAME STRING NOT NULL,") + lines.append(" SOURCE_DATABASE STRING NOT NULL,") + lines.append(" SOURCE_SCHEMA STRING NOT NULL,") + lines.append(" SOURCE_TABLE STRING NOT NULL,") + lines.append(" FILTER_COL STRING,") + lines.append(" FILTER_VAL STRING,") + lines.append(" ID_EXPR STRING NOT NULL,") + lines.append(" NAME_EXPR STRING,") + lines.append(" SUBTYPE_EXPR STRING,") + lines.append(f" ONTOLOGY_NAME STRING DEFAULT {sql_escape(ontology_name)}") + lines.append(");\n") + + class_maps = mappings.get("class_mappings", []) + if class_maps: + lines.append("INSERT INTO ONT_CLASS_MAP (CLASS_NAME, SOURCE_DATABASE, SOURCE_SCHEMA, SOURCE_TABLE, FILTER_COL, FILTER_VAL, ID_EXPR, NAME_EXPR)") + lines.append("SELECT * FROM VALUES") + rows = [] + for cm in class_maps: + # Parse source_table which may be fully qualified + src = cm["source_table"] + parts = src.split(".") + if len(parts) == 3: + src_db, src_schema, src_tbl = parts + else: + src_db, src_schema, src_tbl = database, schema, src + + if kg_path: + # In KG path, source is KG_NODE with filter + rows.append( + f" ({sql_escape(cm['class_name'])}, {sql_escape(database)}, {sql_escape(schema)}, " + f"'KG_NODE', 'NODE_TYPE', 
{sql_escape(cm['class_name'])}, " + f"'NODE_ID', 'NAME')" + ) + else: + rows.append( + f" ({sql_escape(cm['class_name'])}, {sql_escape(src_db)}, {sql_escape(src_schema)}, " + f"{sql_escape(src_tbl)}, {sql_escape(cm.get('filter_condition'))}, NULL, " + f"{sql_escape(cm['id_column'])}, {sql_escape(cm.get('name_column'))})" + ) + lines.append(",\n".join(rows)) + lines.append("AS t(CLASS_NAME, SOURCE_DATABASE, SOURCE_SCHEMA, SOURCE_TABLE, FILTER_COL, FILTER_VAL, ID_EXPR, NAME_EXPR);") + lines.append("") + + # ONT_REL_MAP + lines.append("CREATE TABLE IF NOT EXISTS ONT_REL_MAP (") + lines.append(" MAP_ID STRING DEFAULT UUID_STRING() PRIMARY KEY,") + lines.append(" REL_NAME STRING NOT NULL,") + lines.append(" SOURCE_DATABASE STRING NOT NULL,") + lines.append(" SOURCE_SCHEMA STRING NOT NULL,") + lines.append(" SOURCE_TABLE STRING NOT NULL,") + lines.append(" SRC_ID_EXPR STRING NOT NULL,") + lines.append(" DST_ID_EXPR STRING NOT NULL,") + lines.append(" FILTER_COL STRING,") + lines.append(" FILTER_VAL STRING,") + lines.append(f" ONTOLOGY_NAME STRING DEFAULT {sql_escape(ontology_name)}") + lines.append(");\n") + + rel_maps = mappings.get("relation_mappings", []) + if rel_maps: + lines.append("INSERT INTO ONT_REL_MAP (REL_NAME, SOURCE_DATABASE, SOURCE_SCHEMA, SOURCE_TABLE, SRC_ID_EXPR, DST_ID_EXPR, FILTER_COL, FILTER_VAL)") + lines.append("SELECT * FROM VALUES") + rows = [] + for rm in rel_maps: + src = rm["source_table"] + parts = src.split(".") + if len(parts) == 3: + src_db, src_schema, src_tbl = parts + else: + src_db, src_schema, src_tbl = database, schema, src + + if kg_path: + rows.append( + f" ({sql_escape(rm['rel_name'])}, {sql_escape(database)}, {sql_escape(schema)}, " + f"'KG_EDGE', 'SRC_ID', 'DST_ID', 'EDGE_TYPE', {sql_escape(rm['rel_name'])})" + ) + else: + src_col_val = rm.get("src_column") or rm.get("src_id_column", "SRC_ID") + dst_col_val = rm.get("dst_column") or rm.get("dst_id_column", "DST_ID") + rows.append( + f" ({sql_escape(rm['rel_name'])}, 
{sql_escape(src_db)}, {sql_escape(src_schema)}, " + f"{sql_escape(src_tbl)}, {sql_escape(src_col_val)}, {sql_escape(dst_col_val)}, " + f"NULL, NULL)" + ) + lines.append(",\n".join(rows)) + lines.append("AS t(REL_NAME, SOURCE_DATABASE, SOURCE_SCHEMA, SOURCE_TABLE, SRC_ID_EXPR, DST_ID_EXPR, FILTER_COL, FILTER_VAL);") + lines.append("") + + # ── ONT_ONTOLOGY ── + lines.append("-- Ontology Registry") + lines.append("CREATE TABLE IF NOT EXISTS ONT_ONTOLOGY (") + lines.append(" ONTOLOGY_NAME STRING PRIMARY KEY,") + lines.append(" DESCRIPTION STRING,") + lines.append(" VERSION STRING,") + lines.append(" DEFAULT_SCHEMA STRING,") + lines.append(" CREATED_BY STRING,") + lines.append(" CREATED_AT TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),") + lines.append(" IS_ACTIVE BOOLEAN DEFAULT TRUE") + lines.append(");\n") + lines.append(f"INSERT INTO ONT_ONTOLOGY (ONTOLOGY_NAME, VERSION, DESCRIPTION, DEFAULT_SCHEMA, CREATED_BY, IS_ACTIVE)") + lines.append(f"VALUES ({sql_escape(ontology_name)}, '1.0.0', {sql_escape(ontology_name + ' Ontology')}, {sql_escape(schema)}, 'SYSTEM', TRUE);\n") + + # ── ONT_OBJECT_SOURCE ── + lines.append("-- Object Source Mappings") + lines.append("CREATE TABLE IF NOT EXISTS ONT_OBJECT_SOURCE (") + lines.append(" ONTOLOGY_NAME STRING,") + lines.append(" OBJ_TYPE STRING,") + lines.append(" SOURCE_TABLE STRING,") + lines.append(" FILTER_SQL STRING,") + lines.append(" MAPPING VARIANT,") + lines.append(" PRIMARY KEY (ONTOLOGY_NAME, OBJ_TYPE, SOURCE_TABLE)") + lines.append(");\n") + + if class_maps: + for cm in class_maps: + cls_name = cm["class_name"] + if kg_path: + src_tbl = "KG_NODE" + filter_sql = f"NODE_TYPE = ''{cls_name.upper()}''" + # Build mapping JSON from columns + mapping_pairs = ['"NODE_ID": "id"', '"NAME": "name"'] + for col in cm.get("columns", []): + if not col.get("is_primary_key") and not col.get("is_foreign_key"): + col_lower = col["name"].lower() + if col_lower not in ("node_id", "node_type", "name", "props", "ts_ingested"): + 
mapping_pairs.append(f'"PROPS:{col["name"]}": "{col_lower}"') + mapping_json = "{" + ", ".join(mapping_pairs) + "}" + else: + src = cm["source_table"] + parts = src.split(".") + src_tbl = parts[-1] if parts else src + filter_sql = "" + mapping_pairs = [] + for col in cm.get("columns", []): + mapping_pairs.append(f'"{col["name"]}": "{col["name"].lower()}"') + mapping_json = "{" + ", ".join(mapping_pairs) + "}" + lines.append(f"INSERT INTO ONT_OBJECT_SOURCE (ONTOLOGY_NAME, OBJ_TYPE, SOURCE_TABLE, FILTER_SQL, MAPPING)") + lines.append(f"SELECT {sql_escape(ontology_name)}, {sql_escape(cls_name)}, {sql_escape(src_tbl)}, {sql_escape(filter_sql)}, PARSE_JSON('{mapping_json}');") + lines.append("") + + # ── ONT_LINK_SOURCE ── + lines.append("-- Link Source Mappings") + lines.append("CREATE TABLE IF NOT EXISTS ONT_LINK_SOURCE (") + lines.append(" ONTOLOGY_NAME STRING,") + lines.append(" LINK_TYPE STRING,") + lines.append(" SOURCE_TABLE STRING,") + lines.append(" FILTER_SQL STRING,") + lines.append(" MAPPING VARIANT,") + lines.append(" PRIMARY KEY (ONTOLOGY_NAME, LINK_TYPE, SOURCE_TABLE)") + lines.append(");\n") + + if rel_maps: + for rm in rel_maps: + rel_name = rm["rel_name"] + if kg_path: + src_tbl = "KG_EDGE" + filter_sql = f"EDGE_TYPE = ''{rel_name}''" + mapping_json = '{"SRC_ID": "src_id", "DST_ID": "dst_id"}' + else: + src = rm["source_table"] + parts = src.split(".") + src_tbl = parts[-1] if parts else src + filter_sql = "" + src_col_val = rm.get("src_column") or rm.get("src_id_column", "SRC_ID") + dst_col_val = rm.get("dst_column") or rm.get("dst_id_column", "DST_ID") + mapping_json = '{' + f'"SRC_ID": "{src_col_val}", "DST_ID": "{dst_col_val}"' + '}' + lines.append(f"INSERT INTO ONT_LINK_SOURCE (ONTOLOGY_NAME, LINK_TYPE, SOURCE_TABLE, FILTER_SQL, MAPPING)") + lines.append(f"SELECT {sql_escape(ontology_name)}, {sql_escape(rel_name)}, {sql_escape(src_tbl)}, {sql_escape(filter_sql)}, PARSE_JSON('{mapping_json}');") + lines.append("") + + # ── ONT_SHARED_PROPERTY ── + 
lines.append("-- Shared Properties") + lines.append("CREATE TABLE IF NOT EXISTS ONT_SHARED_PROPERTY (") + lines.append(" SHARED_PROP_NAME STRING PRIMARY KEY,") + lines.append(" BASE_TYPE STRING,") + lines.append(" DESCRIPTION STRING,") + lines.append(" DEFAULT_FORMAT STRING") + lines.append(");\n") + + # Auto-detect shared properties: columns that appear in 2+ classes with the same name + col_counts: dict[str, dict] = {} # col_name -> {type, count} + for cm in class_maps: + for col in cm.get("columns", []): + if col.get("is_primary_key") or col.get("is_foreign_key"): + continue + cn = col["name"].upper() + if cn not in col_counts: + col_counts[cn] = {"type": col["data_type"], "count": 0} + col_counts[cn]["count"] += 1 + shared_props = {k: v for k, v in col_counts.items() if v["count"] >= 2} + if shared_props: + lines.append("INSERT INTO ONT_SHARED_PROPERTY (SHARED_PROP_NAME, BASE_TYPE, DESCRIPTION) VALUES") + rows = [] + for sp_name, sp_info in shared_props.items(): + cast_t = _sf_cast_type(sp_info["type"]) + rows.append(f" ({sql_escape(sp_name.lower())}, {sql_escape(cast_t)}, {sql_escape(f'Shared property {sp_name.lower()}')})") + lines.append(",\n".join(rows) + ";\n") + + # ── ONT_PROPERTY ── + lines.append("-- Property Definitions") + lines.append("CREATE TABLE IF NOT EXISTS ONT_PROPERTY (") + lines.append(" CLASS_NAME STRING,") + lines.append(" PROP_NAME STRING,") + lines.append(" DATA_TYPE STRING,") + lines.append(" SHARED_PROP_NAME STRING,") + lines.append(" IS_REQUIRED BOOLEAN DEFAULT FALSE,") + lines.append(" IS_INDEXED BOOLEAN DEFAULT FALSE,") + lines.append(" DESCRIPTION STRING,") + lines.append(" PRIMARY KEY (CLASS_NAME, PROP_NAME)") + lines.append(");\n") + + # Auto-populate ONT_PROPERTY from column metadata + prop_rows = [] + for cm in class_maps: + cls_name = cm["class_name"] + for col in cm.get("columns", []): + if col.get("is_primary_key") or col.get("is_foreign_key"): + continue + cn = col["name"].upper() + if cn in ("NODE_ID", "NODE_TYPE", "NAME", 
"PROPS", "TS_INGESTED"): + continue + cast_t = _sf_cast_type(col["data_type"]) + sp = col["name"].lower() if cn in shared_props else None + is_req = not col.get("nullable", True) + col_name = col["name"] + prop_rows.append( + f" ({sql_escape(cls_name)}, {sql_escape(col['name'].lower())}, " + f"{sql_escape(cast_t)}, {sql_escape(sp)}, " + f"{str(is_req).upper()}, FALSE, {sql_escape(col_name + ' of ' + cls_name)})" + ) + if prop_rows: + lines.append("INSERT INTO ONT_PROPERTY (CLASS_NAME, PROP_NAME, DATA_TYPE, SHARED_PROP_NAME, IS_REQUIRED, IS_INDEXED, DESCRIPTION) VALUES") + lines.append(",\n".join(prop_rows) + ";\n") + + # ── ONT_DERIVED_PROPERTY ── + lines.append("-- Derived Properties") + lines.append("CREATE TABLE IF NOT EXISTS ONT_DERIVED_PROPERTY (") + lines.append(" CLASS_NAME STRING,") + lines.append(" PROP_NAME STRING,") + lines.append(" DEFINITION_KIND STRING,") + lines.append(" SQL_EXPR STRING,") + lines.append(" FUNCTION_NAME STRING,") + lines.append(" DESCRIPTION STRING,") + lines.append(" PRIMARY KEY (CLASS_NAME, PROP_NAME)") + lines.append(");\n") + + # ── ONT_INTERFACE ── + lines.append("-- Interfaces (Polymorphism)") + lines.append("CREATE TABLE IF NOT EXISTS ONT_INTERFACE (") + lines.append(" INTERFACE_NAME STRING PRIMARY KEY,") + lines.append(" DESCRIPTION STRING") + lines.append(");\n") + + # ── ONT_INTERFACE_PROPERTY ── + lines.append("CREATE TABLE IF NOT EXISTS ONT_INTERFACE_PROPERTY (") + lines.append(" INTERFACE_NAME STRING,") + lines.append(" PROP_NAME STRING,") + lines.append(" SHARED_PROP_NAME STRING,") + lines.append(" PRIMARY KEY (INTERFACE_NAME, PROP_NAME)") + lines.append(");\n") + + # ── ONT_INTERFACE_IMPL ── + lines.append("CREATE TABLE IF NOT EXISTS ONT_INTERFACE_IMPL (") + lines.append(" INTERFACE_NAME STRING,") + lines.append(" CLASS_NAME STRING,") + lines.append(" PRIMARY KEY (INTERFACE_NAME, CLASS_NAME)") + lines.append(");\n") + + # Auto-populate interfaces from shared properties + if shared_props: + # Create one interface per 
shared property pattern (Named, Temporal, etc.) + # Group classes by shared property membership + interface_rows = [] + impl_rows = [] + prop_iface_rows = [] + for sp_name in shared_props: + iface_name = f"Has{sp_name.replace('_',' ').title().replace(' ','')}" + interface_rows.append(f" ({sql_escape(iface_name)}, {sql_escape(f'Entities with {sp_name.lower()} property')})") + prop_iface_rows.append(f" ({sql_escape(iface_name)}, {sql_escape(sp_name.lower())}, {sql_escape(sp_name.lower())})") + for cm in class_maps: + for col in cm.get("columns", []): + if col["name"].upper() == sp_name: + impl_rows.append(f" ({sql_escape(iface_name)}, {sql_escape(cm['class_name'])})") + break + if interface_rows: + lines.append("INSERT INTO ONT_INTERFACE (INTERFACE_NAME, DESCRIPTION) VALUES") + lines.append(",\n".join(interface_rows) + ";\n") + if prop_iface_rows: + lines.append("INSERT INTO ONT_INTERFACE_PROPERTY (INTERFACE_NAME, PROP_NAME, SHARED_PROP_NAME) VALUES") + lines.append(",\n".join(prop_iface_rows) + ";\n") + if impl_rows: + lines.append("INSERT INTO ONT_INTERFACE_IMPL (INTERFACE_NAME, CLASS_NAME) VALUES") + lines.append(",\n".join(impl_rows) + ";\n") + + # ── ONT_RULE ── + lines.append("-- Inference Rules") + lines.append("CREATE TABLE IF NOT EXISTS ONT_RULE (") + lines.append(" RULE_ID STRING PRIMARY KEY,") + lines.append(" RULE_KIND STRING,") + lines.append(" TARGET_REL STRING,") + lines.append(" SOURCE_REL_1 STRING,") + lines.append(" SOURCE_REL_2 STRING,") + lines.append(" INVERSE_OF STRING,") + lines.append(" DESCRIPTION STRING,") + lines.append(" IS_ENABLED BOOLEAN DEFAULT TRUE,") + lines.append(" TS_CREATED TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP()") + lines.append(");\n") + + # Auto-populate inverse rules from relation definitions that have inverse_name + rule_rows = [] + rule_idx = 0 + for rel in relations: + inv = rel.get("inverse_name") + if inv: + rule_idx += 1 + rel_name_val = rel.get("rel_name") or rel.get("name") + rule_rows.append( + f" 
({sql_escape(f'RULE_INV_{rule_idx:03d}')}, 'INVERSE', {sql_escape(inv)}, " + f"NULL, NULL, {sql_escape(rel_name_val)}, TRUE, " + f"{sql_escape('Infer ' + inv + ' from ' + rel_name_val)})" + ) + if rule_rows: + lines.append("INSERT INTO ONT_RULE (RULE_ID, RULE_KIND, TARGET_REL, SOURCE_REL_1, SOURCE_REL_2, INVERSE_OF, IS_ENABLED, DESCRIPTION) VALUES") + lines.append(",\n".join(rule_rows) + ";\n") + + # ── REL_EDGE_INFERRED ── + lines.append("-- Inferred Edges") + lines.append("CREATE TABLE IF NOT EXISTS REL_EDGE_INFERRED (") + lines.append(" REL_NAME STRING NOT NULL,") + lines.append(" SRC_ID STRING NOT NULL,") + lines.append(" DST_ID STRING NOT NULL,") + lines.append(" INFERENCE_KIND STRING,") + lines.append(" RULE_ID STRING,") + lines.append(" WEIGHT FLOAT DEFAULT 1.0,") + lines.append(" EFFECTIVE_START DATE,") + lines.append(" EFFECTIVE_END DATE,") + lines.append(" COMPUTED_AT TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),") + lines.append(" PRIMARY KEY (REL_NAME, SRC_ID, DST_ID, RULE_ID)") + lines.append(");\n") + + # ── ONT_CONSTRAINT_VIOLATION ── + lines.append("-- Data Quality Constraints") + lines.append("CREATE TABLE IF NOT EXISTS ONT_CONSTRAINT_VIOLATION (") + lines.append(" VIOLATION_ID STRING DEFAULT UUID_STRING(),") + lines.append(" CHECK_NAME STRING,") + lines.append(" SCOPE STRING,") + lines.append(" REL_OR_CLASS STRING,") + lines.append(" SRC_ID STRING,") + lines.append(" DST_ID STRING,") + lines.append(" DETAILS STRING,") + lines.append(" OBSERVED_AT TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),") + lines.append(" PRIMARY KEY (VIOLATION_ID)") + lines.append(");\n") + + # ── ACT_TYPE ── + lines.append("-- Action Types") + lines.append("CREATE TABLE IF NOT EXISTS ACT_TYPE (") + lines.append(" ACTION_TYPE_ID STRING PRIMARY KEY,") + lines.append(" ACTION_NAME STRING NOT NULL,") + lines.append(" DESCRIPTION STRING,") + lines.append(" ONTOLOGY_NAME STRING,") + lines.append(" TARGET_CLASS STRING,") + lines.append(" HANDLER_PROC STRING,") + lines.append(" 
IS_ENABLED BOOLEAN DEFAULT TRUE,") + lines.append(" TS_CREATED TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP()") + lines.append(");\n") + + # ── ACT_DEF ── + lines.append("-- Action Parameter Definitions") + lines.append("CREATE TABLE IF NOT EXISTS ACT_DEF (") + lines.append(" ACTION_TYPE_ID STRING,") + lines.append(" PARAM_NAME STRING,") + lines.append(" PARAM_TYPE STRING,") + lines.append(" IS_REQUIRED BOOLEAN DEFAULT FALSE,") + lines.append(" DESCRIPTION STRING,") + lines.append(" PRIMARY KEY (ACTION_TYPE_ID, PARAM_NAME)") + lines.append(");\n") + + # ── ACT_INVOCATION ── + lines.append("-- Action Invocation Log") + lines.append("CREATE TABLE IF NOT EXISTS ACT_INVOCATION (") + lines.append(" INVOCATION_ID STRING PRIMARY KEY,") + lines.append(" ACTION_TYPE_ID STRING NOT NULL,") + lines.append(" TARGET_OBJECT_ID STRING,") + lines.append(" PARAMS VARIANT,") + lines.append(" STATUS STRING,") + lines.append(" RESULT_MSG STRING,") + lines.append(" INVOKED_BY STRING,") + lines.append(" INVOKED_AT TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),") + lines.append(" COMPLETED_AT TIMESTAMP_NTZ") + lines.append(");\n") + + # ── ONT_FUNCTION ── + lines.append("-- Function Catalog") + lines.append("CREATE TABLE IF NOT EXISTS ONT_FUNCTION (") + lines.append(" FUNCTION_NAME STRING,") + lines.append(" VERSION STRING,") + lines.append(" LANGUAGE STRING,") + lines.append(" SNOWFLAKE_REF STRING,") + lines.append(" DESCRIPTION STRING,") + lines.append(" INPUT_SCHEMA VARIANT,") + lines.append(" OUTPUT_SCHEMA VARIANT,") + lines.append(" ONTOLOGY_NAME STRING,") + lines.append(" PRIMARY KEY (ONTOLOGY_NAME, FUNCTION_NAME, VERSION)") + lines.append(");\n") + + # ── ONT_FUNCTION_BINDING ── + lines.append("-- Function Bindings") + lines.append("CREATE TABLE IF NOT EXISTS ONT_FUNCTION_BINDING (") + lines.append(" ONTOLOGY_NAME STRING,") + lines.append(" FUNCTION_NAME STRING,") + lines.append(" VERSION STRING,") + lines.append(" BOUND_TO_KIND STRING,") + lines.append(" BOUND_TO_NAME STRING") + 
lines.append(");\n") + + # ── OBJ_VIEW_DEF ── + lines.append("-- Object View Definitions (UI/Governance)") + lines.append("CREATE TABLE IF NOT EXISTS OBJ_VIEW_DEF (") + lines.append(" OBJ_TYPE STRING,") + lines.append(" VIEW_NAME STRING,") + lines.append(" CREATED_BY STRING,") + lines.append(" DESCRIPTION STRING,") + lines.append(" DISPLAY_COLS VARIANT,") + lines.append(" VERSION STRING DEFAULT '1.0',") + lines.append(" STATUS STRING DEFAULT 'ACTIVE',") + lines.append(" TS_CREATED TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),") + lines.append(" PRIMARY KEY (OBJ_TYPE, VIEW_NAME)") + lines.append(");\n") + + # Auto-populate OBJ_VIEW_DEF from class mappings + for cm in class_maps: + cls_name = cm["class_name"] + view_name = f"V_{cls_name.upper()}" + display_cols = [ + c["name"].upper() for c in cm.get("columns", []) + if not c.get("is_primary_key") and not c.get("is_foreign_key") + and c["name"].upper() not in ("NODE_ID", "NODE_TYPE", "PROPS", "TS_INGESTED") + ][:5] # Limit to 5 display columns + cols_array = ", ".join(f"'{c}'" for c in display_cols) + lines.append(f"INSERT INTO OBJ_VIEW_DEF (OBJ_TYPE, VIEW_NAME, CREATED_BY, DESCRIPTION, DISPLAY_COLS)") + lines.append(f"SELECT {sql_escape(cls_name)}, {sql_escape(view_name)}, 'SYSTEM', {sql_escape(f'Standard {cls_name} view')}, ARRAY_CONSTRUCT({cols_array});") + lines.append("") + + # ── OBJ_VIEW_FIELD ── + lines.append("-- Object View Fields") + lines.append("CREATE TABLE IF NOT EXISTS OBJ_VIEW_FIELD (") + lines.append(" OBJ_TYPE STRING,") + lines.append(" VIEW_NAME STRING,") + lines.append(" VERSION STRING DEFAULT '1.0',") + lines.append(" FIELD_ORDER NUMBER,") + lines.append(" PROP_NAME STRING,") + lines.append(" RENDER_HINT STRING,") + lines.append(" PRIMARY KEY (OBJ_TYPE, VIEW_NAME, VERSION, PROP_NAME)") + lines.append(");\n") + + # ── ONT_ROLE ── + lines.append("-- Roles and Permissions") + lines.append("CREATE TABLE IF NOT EXISTS ONT_ROLE (") + lines.append(" ONTOLOGY_NAME STRING,") + lines.append(" ONT_ROLE_NAME 
def generate_views_sql(
    classes: list[dict],
    mappings: dict,
    database: str,
    schema: str,
    ontology_name: str,
    kg_path: bool,
) -> str:
    """Generate abstract ontology views SQL.

    Emits one ``VW_ONT_<CLASS>`` view per entry in
    ``mappings["class_mappings"]``, a ``VW_ONT_ALL_ENTITIES`` union when more
    than one class view exists, and a ``REL_RESOLVED`` relationship view.

    Args:
        classes: Class definitions. Not read here; kept so the signature
            stays stable for existing callers.
        mappings: Dict with an optional ``class_mappings`` list and, on the
            direct-table path, an optional ``relation_mappings`` list.
        database: Database part of the target ``USE SCHEMA`` statement.
        schema: Schema part of the target ``USE SCHEMA`` statement.
        ontology_name: Name shown in the script banner.
        kg_path: ``True`` to read from ``KG_NODE``/``KG_EDGE``; ``False`` to
            read from the mapped source tables.

    Returns:
        The SQL script as one newline-joined string.

    Raises:
        KeyError: If a class mapping lacks ``class_name`` (or, on the
            direct-table path, ``source_table``/``id_column``), or a relation
            mapping lacks ``rel_name``/``source_table``/``src_column``/
            ``dst_column``.
    """

    def sql_literal(value) -> str:
        # Double embedded single quotes so a name such as O'Hare yields a
        # valid SQL string literal instead of terminating it early.
        return "'" + str(value).replace("'", "''") + "'"

    fqn = f"{database}.{schema}"
    banner = f"-- {'=' * 76}"
    lines = [
        banner,
        f"-- Layer 3: Abstract Ontology Views for {ontology_name}",
        f"-- Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}",
        f"{banner}\n",
        f"USE SCHEMA {fqn};\n",
    ]

    class_maps = mappings.get("class_mappings", [])

    # Per-class entity views
    view_names = []
    for cm in class_maps:
        cls_name = cm["class_name"]
        view_name = f"VW_ONT_{cls_name.upper()}"
        view_names.append(view_name)
        type_literal = sql_literal(cls_name)

        if kg_path:
            lines += [
                f"-- {view_name}: Entity view for {cls_name} (KG path)",
                f"CREATE OR REPLACE VIEW {view_name} AS",
                "SELECT",
                " NODE_ID AS ENTITY_ID,",
                " NODE_TYPE AS ENTITY_TYPE,",
                " NAME AS ENTITY_NAME,",
                " PROPS",
                "FROM KG_NODE",
                f"WHERE NODE_TYPE = {type_literal};",
            ]
        else:
            src = cm["source_table"]
            id_col = cm["id_column"]
            # Fall back to the id column when no display-name column is mapped.
            name_col = cm.get("name_column") or id_col
            filt = cm.get("filter_condition")

            lines += [
                f"-- {view_name}: Entity view for {cls_name} (direct table path)",
                f"CREATE OR REPLACE VIEW {view_name} AS",
                "SELECT",
                f" {id_col}::STRING AS ENTITY_ID,",
                f" {type_literal} AS ENTITY_TYPE,",
                f" {name_col}::STRING AS ENTITY_NAME,",
                " OBJECT_CONSTRUCT(*) AS PROPS",
                f"FROM {src}",
            ]
            if filt:
                lines.append(f"WHERE {filt}")
            lines.append(";")
        lines.append("")

    # Unified entity view (only meaningful with at least two class views)
    if len(view_names) > 1:
        lines.append("-- VW_ONT_ALL_ENTITIES: Unified view of all entity types")
        lines.append("CREATE OR REPLACE VIEW VW_ONT_ALL_ENTITIES AS")
        for i, vn in enumerate(view_names):
            if i > 0:
                lines.append("UNION ALL")
            lines.append(f"SELECT ENTITY_ID, ENTITY_TYPE, ENTITY_NAME, PROPS FROM {vn}")
        lines.append(";\n")

    # Resolved relationships view
    if kg_path:
        lines += [
            "-- REL_RESOLVED: Resolved relationships with node names",
            "CREATE OR REPLACE VIEW REL_RESOLVED AS",
            "SELECT",
            " e.EDGE_TYPE AS REL_NAME,",
            " e.SRC_ID,",
            " src.NAME AS SRC_NAME,",
            " src.NODE_TYPE AS SRC_TYPE,",
            " e.DST_ID,",
            " dst.NAME AS DST_NAME,",
            " dst.NODE_TYPE AS DST_TYPE,",
            " e.WEIGHT,",
            " e.EFFECTIVE_START,",
            " e.EFFECTIVE_END",
            "FROM KG_EDGE e",
            "LEFT JOIN KG_NODE src ON e.SRC_ID = src.NODE_ID",
            "LEFT JOIN KG_NODE dst ON e.DST_ID = dst.NODE_ID;\n",
        ]
    else:
        # Direct-table path: one SELECT per relation mapping, unioned together.
        rel_maps = mappings.get("relation_mappings", [])
        if rel_maps:
            lines.append("-- REL_RESOLVED: Resolved relationships (direct table path)")
            lines.append("CREATE OR REPLACE VIEW REL_RESOLVED AS")
            for i, rm in enumerate(rel_maps):
                if i > 0:
                    lines.append("UNION ALL")
                lines += [
                    "SELECT",
                    f" {sql_literal(rm['rel_name'])} AS REL_NAME,",
                    f" {rm['src_column']}::STRING AS SRC_ID,",
                    " NULL AS SRC_NAME,",
                    " NULL AS SRC_TYPE,",
                    f" {rm['dst_column']}::STRING AS DST_ID,",
                    " NULL AS DST_NAME,",
                    " NULL AS DST_TYPE,",
                    " 1.0 AS WEIGHT,",
                    " NULL AS EFFECTIVE_START,",
                    " NULL AS EFFECTIVE_END",
                    f"FROM {rm['source_table']}",
                ]
            lines.append(";\n")

    return "\n".join(lines)
def generate_view_generator_sp(database: str, schema: str, ontology_name: str) -> str:
    """Generate the SP_GENERATE_ONTOLOGY_VIEWS stored procedure.

    The returned script switches to ``<database>.<schema>``, creates a Python
    stored procedure that rebuilds one ``VW_ONT_<CLASS>`` view per row of
    ``ONT_CLASS_MAP`` (plus ``VW_ONT_ALL_ENTITIES`` when more than one view
    was built), and then calls the procedure once.

    Args:
        database: Database part of the ``USE SCHEMA`` target.
        schema: Schema part of the ``USE SCHEMA`` target.
        ontology_name: Not used in the generated script; kept for signature
            compatibility with callers.

    Returns:
        The SQL script text.
    """
    fqn = f"{database}.{schema}"
    return f"""-- ============================================================================
-- View Generator Stored Procedure
-- Reads ONT_CLASS_MAP and regenerates VW_ONT_* views dynamically
-- ============================================================================

USE SCHEMA {fqn};

CREATE OR REPLACE PROCEDURE SP_GENERATE_ONTOLOGY_VIEWS()
RETURNS STRING
LANGUAGE PYTHON
RUNTIME_VERSION = '3.12'
PACKAGES = ('snowflake-snowpark-python')
HANDLER = 'generate_views'
AS
$$
def generate_views(session):
    import json

    def quote(value):
        # Double embedded single quotes so the value is a valid SQL literal.
        return "'" + str(value).replace("'", "''") + "'"

    # Read class mappings
    maps_df = session.sql("SELECT * FROM ONT_CLASS_MAP").collect()

    views_created = []
    union_parts = []

    for row in maps_df:
        # Snowpark Row objects have no dict-style .get(); convert once so
        # optional columns can be looked up safely.
        fields = row.as_dict()
        cls_name = fields['CLASS_NAME']
        src_db = fields['SOURCE_DATABASE']
        src_schema = fields['SOURCE_SCHEMA']
        src_table = fields['SOURCE_TABLE']
        filter_col = fields.get('FILTER_COL')
        filter_val = fields.get('FILTER_VAL')
        id_expr = fields['ID_EXPR']
        name_expr = fields.get('NAME_EXPR') or id_expr

        view_name = f"VW_ONT_{{cls_name.upper()}}"
        fqn_src = f"{{src_db}}.{{src_schema}}.{{src_table}}"

        where_clause = ""
        if filter_col and filter_val:
            where_clause = f"WHERE {{filter_col}} = {{quote(filter_val)}}"

        view_sql = f\"\"\"
CREATE OR REPLACE VIEW {{view_name}} AS
SELECT
    {{id_expr}}::STRING AS ENTITY_ID,
    {{quote(cls_name)}} AS ENTITY_TYPE,
    {{name_expr}}::STRING AS ENTITY_NAME,
    OBJECT_CONSTRUCT(*) AS PROPS
FROM {{fqn_src}}
{{where_clause}}
\"\"\"
        session.sql(view_sql.strip()).collect()
        views_created.append(view_name)

        union_parts.append(
            f"SELECT ENTITY_ID, ENTITY_TYPE, ENTITY_NAME, PROPS FROM {{view_name}}"
        )

    # Create unified view
    if len(union_parts) > 1:
        union_sql = "CREATE OR REPLACE VIEW VW_ONT_ALL_ENTITIES AS\\n" + "\\nUNION ALL\\n".join(union_parts)
        session.sql(union_sql).collect()
        views_created.append("VW_ONT_ALL_ENTITIES")

    return json.dumps({{"views_created": views_created, "count": len(views_created)}})
$$;

-- Run it once to generate initial views
CALL SP_GENERATE_ONTOLOGY_VIEWS();
"""
def generate_inference_sql(database: str, schema: str) -> str:
    """Generate optional inference engine stored procedures (KG path only).

    The returned script defines five Python stored procedures:

    - SP_INFER_TRANSITIVE: recursive transitive closure over KG_EDGE
      (depth capped at 5), written to REL_EDGE_INFERRED with one row per
      (source, destination) pair weighted by the shortest depth found.
    - SP_INFER_INVERSE: inverse relationship materialisation driven by
      ONT_RELATION_DEF.INVERSE_REL_NAME.
    - SP_RUN_ONTOLOGY_INFERENCE: master runner for all enabled ONT_RULE rows.
    - SP_CHECK_CARDINALITY_SINGLE: cardinality constraint checker.
    - SP_CHECK_REFERENTIAL: referential integrity checker.

    Every value the handlers splice into a SQL string literal goes through a
    small ``_sql_literal`` helper that doubles single quotes.

    NOTE(review): SP_INFER_INVERSE materialises inverses for every active
    relation under whichever RULE_ID it is given. If ONT_RULE holds several
    INVERSE rules, the runner repeats that work once per rule — confirm this
    is intended.

    Args:
        database: Database named in the ``USE DATABASE`` statement.
        schema: Schema named in the ``USE SCHEMA`` statement.

    Returns:
        The SQL script text.
    """
    # Shared by every handler body; each $$ block is compiled separately by
    # Snowflake, so the helper has to be repeated in each one.
    literal_helper = (
        "def _sql_literal(value):\n"
        "    # Double embedded single quotes so the value is a safe SQL string literal.\n"
        "    return \"'\" + str(value).replace(\"'\", \"''\") + \"'\"\n"
    )
    return f"""-- =============================================================
-- INFERENCE ENGINE & CONSTRAINT PROCEDURES
-- Optional stored procedures for ontology inference and data quality
-- Generated by ontology-stack-builder
-- =============================================================

USE DATABASE {database};
USE SCHEMA {schema};

-- =============================================================
-- SP_INFER_TRANSITIVE
-- Computes transitive closure for a relationship type.
-- If A->B and B->C via the same rel, infers A->C.
-- =============================================================
CREATE OR REPLACE PROCEDURE SP_INFER_TRANSITIVE(TARGET_REL STRING, RULE_ID STRING)
RETURNS STRING
LANGUAGE PYTHON
RUNTIME_VERSION = '3.12'
PACKAGES = ('snowflake-snowpark-python')
HANDLER = 'infer_transitive'
AS
$$
{literal_helper}
def infer_transitive(session, target_rel, rule_id):
    rel_lit = _sql_literal(target_rel)
    rule_lit = _sql_literal(rule_id)
    session.sql(f\"\"\"
        DELETE FROM REL_EDGE_INFERRED WHERE RULE_ID = {{rule_lit}}
    \"\"\").collect()

    # One row per (src, dst): a pair reachable at several depths keeps the
    # weight of its shortest path instead of being inserted once per depth.
    infer_sql = f\"\"\"
        INSERT INTO REL_EDGE_INFERRED (REL_NAME, SRC_ID, DST_ID, INFERENCE_KIND, RULE_ID, WEIGHT)
        WITH RECURSIVE transitive(src, dst, depth) AS (
            SELECT SRC_ID, DST_ID, 1
            FROM KG_EDGE
            WHERE EDGE_TYPE = {{rel_lit}}
            UNION ALL
            SELECT t.src, e.DST_ID, t.depth + 1
            FROM transitive t
            JOIN KG_EDGE e ON t.dst = e.SRC_ID AND e.EDGE_TYPE = {{rel_lit}}
            WHERE t.depth < 5 AND t.src != e.DST_ID
        )
        SELECT
            {{rel_lit}}, src, dst, 'TRANSITIVE', {{rule_lit}}, 1.0 / MIN(depth)
        FROM transitive
        WHERE (src, dst) NOT IN (
            SELECT SRC_ID, DST_ID FROM KG_EDGE WHERE EDGE_TYPE = {{rel_lit}}
        )
        GROUP BY src, dst
    \"\"\"
    session.sql(infer_sql).collect()
    count = session.sql(f\"\"\"
        SELECT COUNT(*) AS cnt FROM REL_EDGE_INFERRED WHERE RULE_ID = {{rule_lit}}
    \"\"\").collect()[0]['CNT']
    return f"Inferred {{count}} transitive edges for {{target_rel}}"
$$;

-- =============================================================
-- SP_INFER_INVERSE
-- Creates inverse relationships based on ONT_RELATION_DEF.INVERSE_REL_NAME
-- =============================================================
CREATE OR REPLACE PROCEDURE SP_INFER_INVERSE(RULE_ID STRING)
RETURNS STRING
LANGUAGE PYTHON
RUNTIME_VERSION = '3.12'
PACKAGES = ('snowflake-snowpark-python')
HANDLER = 'infer_inverse'
AS
$$
{literal_helper}
def infer_inverse(session, rule_id):
    rule_lit = _sql_literal(rule_id)
    session.sql(f\"\"\"
        DELETE FROM REL_EDGE_INFERRED WHERE RULE_ID = {{rule_lit}}
    \"\"\").collect()

    rels = session.sql(\"\"\"
        SELECT REL_NAME, INVERSE_REL_NAME
        FROM ONT_RELATION_DEF
        WHERE INVERSE_REL_NAME IS NOT NULL AND STATUS = 'ACTIVE'
    \"\"\").collect()

    total = 0
    for rel in rels:
        rel_lit = _sql_literal(rel['REL_NAME'])
        inverse_lit = _sql_literal(rel['INVERSE_REL_NAME'])
        session.sql(f\"\"\"
            INSERT INTO REL_EDGE_INFERRED
                (REL_NAME, SRC_ID, DST_ID, INFERENCE_KIND, RULE_ID, WEIGHT, EFFECTIVE_START, EFFECTIVE_END)
            SELECT
                {{inverse_lit}}, DST_ID, SRC_ID, 'INVERSE', {{rule_lit}},
                WEIGHT, EFFECTIVE_START, EFFECTIVE_END
            FROM KG_EDGE
            WHERE EDGE_TYPE = {{rel_lit}}
        \"\"\").collect()
        cnt = session.sql(f\"\"\"
            SELECT COUNT(*) AS cnt FROM REL_EDGE_INFERRED
            WHERE RULE_ID = {{rule_lit}} AND REL_NAME = {{inverse_lit}}
        \"\"\").collect()[0]['CNT']
        total += cnt

    return f"Inferred {{total}} inverse edges"
$$;

-- =============================================================
-- SP_RUN_ONTOLOGY_INFERENCE
-- Master procedure — runs all enabled inference rules in order
-- =============================================================
CREATE OR REPLACE PROCEDURE SP_RUN_ONTOLOGY_INFERENCE()
RETURNS STRING
LANGUAGE PYTHON
RUNTIME_VERSION = '3.12'
PACKAGES = ('snowflake-snowpark-python')
HANDLER = 'run_inference'
AS
$$
def run_inference(session):
    rules = session.sql(\"\"\"
        SELECT RULE_ID, RULE_KIND, TARGET_REL
        FROM ONT_RULE
        WHERE IS_ENABLED = TRUE
        ORDER BY CASE RULE_KIND
            WHEN 'INVERSE' THEN 1
            WHEN 'TRANSITIVE' THEN 2
            WHEN 'PROPERTY_CHAIN' THEN 3
        END
    \"\"\").collect()

    results = []
    for rule in rules:
        rule_id = rule['RULE_ID']
        kind = rule['RULE_KIND']
        try:
            if kind == 'INVERSE':
                r = session.call('SP_INFER_INVERSE', rule_id)
            elif kind == 'TRANSITIVE':
                r = session.call('SP_INFER_TRANSITIVE', rule['TARGET_REL'], rule_id)
            else:
                r = f"Unsupported rule kind: {{kind}}"
            results.append(f"{{rule_id}}: {{r}}")
        except Exception as e:
            results.append(f"{{rule_id}}: ERROR - {{str(e)}}")
    return "\\n".join(results)
$$;

-- =============================================================
-- SP_CHECK_CARDINALITY_SINGLE
-- Checks that a 1:1 or N:1 relationship has at most one edge per source
-- =============================================================
CREATE OR REPLACE PROCEDURE SP_CHECK_CARDINALITY_SINGLE(REL STRING, CHECK_NAME STRING)
RETURNS STRING
LANGUAGE PYTHON
RUNTIME_VERSION = '3.12'
PACKAGES = ('snowflake-snowpark-python')
HANDLER = 'check_cardinality'
AS
$$
{literal_helper}
def check_cardinality(session, rel, check_name):
    rel_lit = _sql_literal(rel)
    check_lit = _sql_literal(check_name)
    session.sql(f\"\"\"
        INSERT INTO ONT_CONSTRAINT_VIOLATION (CHECK_NAME, SCOPE, REL_OR_CLASS, SRC_ID, DETAILS)
        SELECT {{check_lit}}, 'RELATION', {{rel_lit}}, SRC_ID,
               'Multiple edges from same source: ' || COUNT(*) || ' edges'
        FROM KG_EDGE
        WHERE EDGE_TYPE = {{rel_lit}}
          AND (EFFECTIVE_END IS NULL OR EFFECTIVE_END >= CURRENT_DATE())
        GROUP BY SRC_ID HAVING COUNT(*) > 1
    \"\"\").collect()
    cnt = session.sql(f\"\"\"
        SELECT COUNT(*) AS cnt FROM ONT_CONSTRAINT_VIOLATION WHERE CHECK_NAME = {{check_lit}}
    \"\"\").collect()[0]['CNT']
    return f"Found {{cnt}} cardinality violations for {{rel}}"
$$;

-- =============================================================
-- SP_CHECK_REFERENTIAL
-- Checks that all edge endpoints reference existing nodes
-- =============================================================
CREATE OR REPLACE PROCEDURE SP_CHECK_REFERENTIAL(REL STRING, CHECK_NAME STRING)
RETURNS STRING
LANGUAGE PYTHON
RUNTIME_VERSION = '3.12'
PACKAGES = ('snowflake-snowpark-python')
HANDLER = 'check_referential'
AS
$$
{literal_helper}
def check_referential(session, rel, check_name):
    rel_lit = _sql_literal(rel)
    check_lit = _sql_literal(check_name)
    session.sql(f\"\"\"
        INSERT INTO ONT_CONSTRAINT_VIOLATION (CHECK_NAME, SCOPE, REL_OR_CLASS, SRC_ID, DETAILS)
        SELECT {{check_lit}}, 'RELATION', {{rel_lit}}, e.SRC_ID, 'Source node not found'
        FROM KG_EDGE e LEFT JOIN KG_NODE n ON e.SRC_ID = n.NODE_ID
        WHERE e.EDGE_TYPE = {{rel_lit}} AND n.NODE_ID IS NULL
    \"\"\").collect()
    session.sql(f\"\"\"
        INSERT INTO ONT_CONSTRAINT_VIOLATION (CHECK_NAME, SCOPE, REL_OR_CLASS, DST_ID, DETAILS)
        SELECT {{check_lit}}, 'RELATION', {{rel_lit}}, e.DST_ID, 'Destination node not found'
        FROM KG_EDGE e LEFT JOIN KG_NODE n ON e.DST_ID = n.NODE_ID
        WHERE e.EDGE_TYPE = {{rel_lit}} AND n.NODE_ID IS NULL
    \"\"\").collect()
    cnt = session.sql(f\"\"\"
        SELECT COUNT(*) AS cnt FROM ONT_CONSTRAINT_VIOLATION WHERE CHECK_NAME = {{check_lit}}
    \"\"\").collect()[0]['CNT']
    return f"Found {{cnt}} referential integrity violations for {{rel}}"
$$;

COMMENT ON PROCEDURE SP_INFER_TRANSITIVE(STRING, STRING) IS 'Computes transitive closure for a relationship type';
COMMENT ON PROCEDURE SP_INFER_INVERSE(STRING) IS 'Creates inverse relationships based on ontology definitions';
COMMENT ON PROCEDURE SP_RUN_ONTOLOGY_INFERENCE() IS 'Master procedure to run all enabled inference rules';
COMMENT ON PROCEDURE SP_CHECK_CARDINALITY_SINGLE(STRING, STRING) IS 'Checks cardinality constraints for relationships';
COMMENT ON PROCEDURE SP_CHECK_REFERENTIAL(STRING, STRING) IS 'Checks referential integrity for edges';
"""
ONT_CONSTRAINT_VIOLATION (CHECK_NAME, SCOPE, REL_OR_CLASS, DST_ID, DETAILS) + SELECT '{{check_name}}', 'RELATION', '{{rel}}', e.DST_ID, 'Destination node not found' + FROM KG_EDGE e LEFT JOIN KG_NODE n ON e.DST_ID = n.NODE_ID + WHERE e.EDGE_TYPE = '{{rel}}' AND n.NODE_ID IS NULL + \"\"\").collect() + cnt = session.sql(f\"\"\" + SELECT COUNT(*) AS cnt FROM ONT_CONSTRAINT_VIOLATION WHERE CHECK_NAME = '{{check_name}}' + \"\"\").collect()[0]['CNT'] + return f"Found {{cnt}} referential integrity violations for {{rel}}" +$$; + +COMMENT ON PROCEDURE SP_INFER_TRANSITIVE(STRING, STRING) IS 'Computes transitive closure for a relationship type'; +COMMENT ON PROCEDURE SP_INFER_INVERSE(STRING) IS 'Creates inverse relationships based on ontology definitions'; +COMMENT ON PROCEDURE SP_RUN_ONTOLOGY_INFERENCE() IS 'Master procedure to run all enabled inference rules'; +COMMENT ON PROCEDURE SP_CHECK_CARDINALITY_SINGLE(STRING, STRING) IS 'Checks cardinality constraints for relationships'; +COMMENT ON PROCEDURE SP_CHECK_REFERENTIAL(STRING, STRING) IS 'Checks referential integrity for edges'; +""" + + +def generate_graph_traversal_sql(database: str, schema: str) -> str: + """Generate generic SQL UDF graph traversal tools for KG_NODE/KG_EDGE. + + Produces 07_graph_traversal_tools.sql with 4 parameterised UDFs: + - EXPAND_DESCENDANTS_TOOL: recursive downward traversal + - GET_ANCESTORS_TOOL: recursive upward traversal + - GET_HIERARCHY_PATH_TOOL: path between two concepts + - GET_DIRECT_CHILDREN_TOOL: single-hop children + + These operate purely on KG_NODE/KG_EDGE and work for any ontology. 
+ """ + fqn = f"{database}.{schema}" + return f"""-- ============================================================================ +-- 07_graph_traversal_tools.sql +-- Generic SQL UDF Graph Traversal Tools for Cortex Agent +-- Generated by ontology-stack-builder +-- ============================================================================ +-- These UDFs operate on KG_NODE / KG_EDGE and work with any ontology. +-- They are registered as "generic" tools in the Cortex Agent, enabling +-- hierarchy traversal queries that semantic views cannot handle well +-- (e.g., recursive descendant expansion with a runtime root concept). +-- ============================================================================ + +USE SCHEMA {fqn}; + +-- ============================================================================ +-- EXPAND_DESCENDANTS_TOOL: Get all descendants of a root concept +-- ============================================================================ +-- Given a concept name, recursively walks subClassOf edges downward to +-- return all descendant concepts with depth and path. 
+-- +-- Example: SELECT * FROM TABLE(EXPAND_DESCENDANTS_TOOL('Vehicle')); +-- ============================================================================ + +CREATE OR REPLACE FUNCTION EXPAND_DESCENDANTS_TOOL(ROOT_CONCEPT VARCHAR) +RETURNS TABLE ( + NODE_ID VARCHAR, + NODE_NAME VARCHAR, + DEPTH NUMBER, + PATH VARCHAR +) +LANGUAGE SQL +AS +$$ +WITH RECURSIVE +root_node AS ( + SELECT NODE_ID, NAME + FROM {fqn}.KG_NODE + WHERE LOWER(NAME) = LOWER(ROOT_CONCEPT) + LIMIT 1 +), +descendants AS ( + SELECT + e.SRC_ID AS NODE_ID, + n.NAME AS NODE_NAME, + 1 AS DEPTH, + r.NAME || ' -> ' || n.NAME AS PATH + FROM {fqn}.KG_EDGE e + JOIN root_node r ON e.DST_ID = r.NODE_ID + JOIN {fqn}.KG_NODE n ON e.SRC_ID = n.NODE_ID + WHERE e.EDGE_TYPE = 'subClassOf' + + UNION ALL + + SELECT + e.SRC_ID AS NODE_ID, + n.NAME AS NODE_NAME, + d.DEPTH + 1 AS DEPTH, + d.PATH || ' -> ' || n.NAME AS PATH + FROM descendants d + JOIN {fqn}.KG_EDGE e ON d.NODE_ID = e.DST_ID + JOIN {fqn}.KG_NODE n ON e.SRC_ID = n.NODE_ID + WHERE e.EDGE_TYPE = 'subClassOf' + AND d.DEPTH < 15 +) +SELECT NODE_ID, NAME AS NODE_NAME, 0 AS DEPTH, NAME AS PATH +FROM root_node +UNION ALL +SELECT NODE_ID, NODE_NAME, DEPTH, PATH +FROM descendants +ORDER BY DEPTH, NODE_NAME +$$; + +COMMENT ON FUNCTION EXPAND_DESCENDANTS_TOOL(VARCHAR) IS +'Returns all descendants of a root concept in the ontology hierarchy. +Use for cohort expansion queries like "all subtypes of X". +Returns NODE_ID, NODE_NAME, DEPTH (distance from root), and PATH.'; + + +-- ============================================================================ +-- GET_ANCESTORS_TOOL: Get all ancestors of a concept +-- ============================================================================ +-- Given a concept name, recursively walks subClassOf edges upward to +-- return all ancestor concepts with depth (shortest path). 
+-- +-- Example: SELECT * FROM TABLE(GET_ANCESTORS_TOOL('Sedan')); +-- ============================================================================ + +CREATE OR REPLACE FUNCTION GET_ANCESTORS_TOOL(CONCEPT VARCHAR) +RETURNS TABLE ( + ANCESTOR_ID VARCHAR, + ANCESTOR_NAME VARCHAR, + DEPTH NUMBER +) +LANGUAGE SQL +AS +$$ +WITH RECURSIVE +start_node AS ( + SELECT NODE_ID, NAME + FROM {fqn}.KG_NODE + WHERE LOWER(NAME) = LOWER(CONCEPT) + LIMIT 1 +), +ancestors AS ( + SELECT + e.DST_ID AS ANCESTOR_ID, + n.NAME AS ANCESTOR_NAME, + 1 AS DEPTH + FROM start_node s + JOIN {fqn}.KG_EDGE e ON s.NODE_ID = e.SRC_ID + JOIN {fqn}.KG_NODE n ON e.DST_ID = n.NODE_ID + WHERE e.EDGE_TYPE = 'subClassOf' + + UNION ALL + + SELECT + e.DST_ID AS ANCESTOR_ID, + n.NAME AS ANCESTOR_NAME, + a.DEPTH + 1 AS DEPTH + FROM ancestors a + JOIN {fqn}.KG_EDGE e ON a.ANCESTOR_ID = e.SRC_ID + JOIN {fqn}.KG_NODE n ON e.DST_ID = n.NODE_ID + WHERE e.EDGE_TYPE = 'subClassOf' + AND a.DEPTH < 20 +) +SELECT DISTINCT ANCESTOR_ID, ANCESTOR_NAME, MIN(DEPTH) AS DEPTH +FROM ancestors +GROUP BY ANCESTOR_ID, ANCESTOR_NAME +ORDER BY DEPTH, ANCESTOR_NAME +$$; + +COMMENT ON FUNCTION GET_ANCESTORS_TOOL(VARCHAR) IS +'Returns all ancestors of a concept. Use for understanding where a concept sits in the hierarchy.'; + + +-- ============================================================================ +-- GET_HIERARCHY_PATH_TOOL: Find path between two concepts +-- ============================================================================ +-- Given a start and end concept, finds the shortest path through the +-- subClassOf hierarchy (traversing upward from start toward end). 
+-- +-- Example: SELECT * FROM TABLE(GET_HIERARCHY_PATH_TOOL('Sedan', 'Vehicle')); +-- ============================================================================ + +CREATE OR REPLACE FUNCTION GET_HIERARCHY_PATH_TOOL(START_CONCEPT VARCHAR, END_CONCEPT VARCHAR) +RETURNS TABLE ( + STEP NUMBER, + NODE_ID VARCHAR, + NODE_NAME VARCHAR, + RELATIONSHIP VARCHAR +) +LANGUAGE SQL +AS +$$ +WITH RECURSIVE +start_node AS ( + SELECT NODE_ID, NAME + FROM {fqn}.KG_NODE + WHERE LOWER(NAME) = LOWER(START_CONCEPT) + LIMIT 1 +), +end_node AS ( + SELECT NODE_ID, NAME + FROM {fqn}.KG_NODE + WHERE LOWER(NAME) = LOWER(END_CONCEPT) + LIMIT 1 +), +path_up AS ( + SELECT + s.NODE_ID, + s.NAME AS NODE_NAME, + 0 AS STEP, + ARRAY_CONSTRUCT(s.NODE_ID) AS VISITED + FROM start_node s + + UNION ALL + + SELECT + e.DST_ID AS NODE_ID, + n.NAME AS NODE_NAME, + p.STEP + 1 AS STEP, + ARRAY_APPEND(p.VISITED, e.DST_ID) AS VISITED + FROM path_up p + JOIN {fqn}.KG_EDGE e ON p.NODE_ID = e.SRC_ID + JOIN {fqn}.KG_NODE n ON e.DST_ID = n.NODE_ID + CROSS JOIN end_node en + WHERE e.EDGE_TYPE = 'subClassOf' + AND NOT ARRAY_CONTAINS(e.DST_ID::VARIANT, p.VISITED) + AND p.STEP < 20 +) +SELECT + STEP, + NODE_ID, + NODE_NAME, + CASE WHEN STEP = 0 THEN 'START' ELSE 'subClassOf' END AS RELATIONSHIP +FROM path_up +WHERE NODE_ID = (SELECT NODE_ID FROM end_node) + OR STEP <= ( + SELECT MIN(STEP) FROM path_up WHERE NODE_ID = (SELECT NODE_ID FROM end_node) + ) +QUALIFY ROW_NUMBER() OVER (PARTITION BY NODE_ID ORDER BY STEP) = 1 +ORDER BY STEP +$$; + +COMMENT ON FUNCTION GET_HIERARCHY_PATH_TOOL(VARCHAR, VARCHAR) IS +'Returns the path between two concepts in the ontology hierarchy. +Use for lineage queries like "path from X to Y". 
+Returns STEP number, NODE_ID, NODE_NAME, and RELATIONSHIP type.'; + + +-- ============================================================================ +-- GET_DIRECT_CHILDREN_TOOL: Get immediate children of a concept +-- ============================================================================ +-- Simple non-recursive single-hop query for direct children. +-- +-- Example: SELECT * FROM TABLE(GET_DIRECT_CHILDREN_TOOL('Vehicle')); +-- ============================================================================ + +CREATE OR REPLACE FUNCTION GET_DIRECT_CHILDREN_TOOL(PARENT_CONCEPT VARCHAR) +RETURNS TABLE ( + CHILD_ID VARCHAR, + CHILD_NAME VARCHAR, + CHILD_TYPE VARCHAR +) +LANGUAGE SQL +AS +$$ +SELECT + n.NODE_ID AS CHILD_ID, + n.NAME AS CHILD_NAME, + n.NODE_TYPE AS CHILD_TYPE +FROM {fqn}.KG_NODE parent +JOIN {fqn}.KG_EDGE e ON parent.NODE_ID = e.DST_ID +JOIN {fqn}.KG_NODE n ON e.SRC_ID = n.NODE_ID +WHERE LOWER(parent.NAME) = LOWER(PARENT_CONCEPT) +AND e.EDGE_TYPE = 'subClassOf' +ORDER BY n.NAME +$$; + +COMMENT ON FUNCTION GET_DIRECT_CHILDREN_TOOL(VARCHAR) IS +'Returns direct children (depth=1) of a concept. 
def main():
    """Parse CLI arguments and write the numbered ontology SQL files.

    Reads the classes, relations and mappings JSON documents, then writes
    ``01``..``07`` SQL files into ``--output-dir``.  Files 01, 06 and 07 are
    produced on the KG path only; 06 and 07 additionally require their
    ``--include-*`` flag.  Each file is written as soon as its SQL has been
    generated, so a failure in a later generator leaves the earlier files on
    disk.
    """
    import sys  # local import: only needed for the stderr warning below

    def _flag(value: str) -> bool:
        """Interpret a CLI string as a boolean ('true', '1', 'yes')."""
        return value.lower() in ("true", "1", "yes")

    def _load_json(path: str):
        """Load a JSON document, always decoding it as UTF-8."""
        with open(path, encoding="utf-8") as handle:
            return json.load(handle)

    parser = argparse.ArgumentParser(description="Generate Ontology SQL (Layers 1-3)")
    parser.add_argument("--classes-json", required=True)
    parser.add_argument("--relations-json", required=True)
    parser.add_argument("--mappings-json", required=True)
    parser.add_argument("--database", required=True)
    parser.add_argument("--schema", required=True)
    parser.add_argument("--ontology-name", required=True)
    parser.add_argument("--kg-path", default="false", help="'true' for KG path, 'false' for direct table path")
    parser.add_argument("--include-inference", default="false", help="'true' to generate inference engine SPs (KG path only)")
    parser.add_argument("--include-graph-tools", default="false", help="'true' to generate SQL UDF graph traversal tools (KG path only)")
    parser.add_argument("--output-dir", required=True)
    args = parser.parse_args()

    kg_path = _flag(args.kg_path)
    include_inference = _flag(args.include_inference)
    include_graph_tools = _flag(args.include_graph_tools)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    classes = _load_json(args.classes_json)
    relations = _load_json(args.relations_json)
    mappings = _load_json(args.mappings_json)

    database = args.database
    schema = args.schema
    ontology_name = args.ontology_name

    print(f"Generating SQL for {ontology_name} ({'KG' if kg_path else 'direct'} path)")
    print(f"  {len(classes)} classes, {len(relations)} relations")

    # The optional layers only exist on the KG path; say so instead of
    # silently dropping a flag the user explicitly passed.
    if not kg_path:
        ignored = [
            option
            for option, enabled in (
                ("--include-inference", include_inference),
                ("--include-graph-tools", include_graph_tools),
            )
            if enabled
        ]
        if ignored:
            print(
                f"Warning: {', '.join(ignored)} ignored (requires --kg-path true)",
                file=sys.stderr,
            )

    files_written = []

    def _emit(filename: str, sql: str) -> None:
        """Write one SQL file (UTF-8), record it and report it."""
        path = output_dir / filename
        path.write_text(sql, encoding="utf-8")
        files_written.append(str(path))
        print(f"  Wrote {path}")

    # 01: Physical layer (KG path only)
    if kg_path:
        _emit(
            "01_physical_layer.sql",
            generate_physical_layer_sql(classes, relations, mappings, database, schema),
        )

    # 02: Concrete entity & relationship views (V_{CLASS}, V_{REL})
    _emit(
        "02_concrete_views.sql",
        generate_concrete_views_sql(mappings, database, schema, kg_path),
    )

    # 03: Metadata tables
    _emit(
        "03_metadata_tables.sql",
        generate_metadata_sql(classes, relations, mappings, database, schema, ontology_name, kg_path),
    )

    # 04: Abstract views
    _emit(
        "04_abstract_views.sql",
        generate_views_sql(classes, mappings, database, schema, ontology_name, kg_path),
    )

    # 05: View generator SP
    _emit(
        "05_view_generator_sp.sql",
        generate_view_generator_sp(database, schema, ontology_name),
    )

    # 06: Inference engine (optional, KG path only)
    if kg_path and include_inference:
        _emit("06_inference_engine.sql", generate_inference_sql(database, schema))

    # 07: Graph traversal tools (optional, KG path only)
    if kg_path and include_graph_tools:
        _emit("07_graph_traversal_tools.sql", generate_graph_traversal_sql(database, schema))

    print(f"\nGenerated {len(files_written)} SQL files in {output_dir}")


if __name__ == "__main__":
    main()
+""" + +import argparse +import json +import sys +from datetime import datetime, timezone +from pathlib import Path + +import yaml + + +# Mapping from Snowflake types to Cortex Analyst semantic model data types +_ANALYST_TYPE_MAP = { + "NUMBER": "NUMBER", "DECIMAL": "NUMBER", "NUMERIC": "NUMBER", + "INT": "NUMBER", "INTEGER": "NUMBER", "BIGINT": "NUMBER", + "SMALLINT": "NUMBER", "TINYINT": "NUMBER", "BYTEINT": "NUMBER", + "FLOAT": "NUMBER", "FLOAT4": "NUMBER", "FLOAT8": "NUMBER", + "DOUBLE": "NUMBER", "DOUBLE PRECISION": "NUMBER", "REAL": "NUMBER", + "VARCHAR": "STRING", "CHAR": "STRING", "CHARACTER": "STRING", + "STRING": "STRING", "TEXT": "STRING", + "BOOLEAN": "BOOLEAN", + "DATE": "DATE", "DATETIME": "TIMESTAMP", "TIME": "TIME", + "TIMESTAMP": "TIMESTAMP", "TIMESTAMP_LTZ": "TIMESTAMP", + "TIMESTAMP_NTZ": "TIMESTAMP", "TIMESTAMP_TZ": "TIMESTAMP", + "VARIANT": "VARIANT", "OBJECT": "VARIANT", "ARRAY": "VARIANT", +} + + +def _analyst_type(col_type: str) -> str: + """Map a Snowflake column type to a Cortex Analyst data type.""" + base = col_type.upper().split("(")[0].strip() + return _ANALYST_TYPE_MAP.get(base, "STRING") + + +def now_ts() -> int: + return int(datetime.now(timezone.utc).timestamp()) + + +def generate_kg_model( + classes: list[dict], + relations: list[dict], + mappings: dict, + database: str, + schema: str, + ontology_name: str, + questions: list[str], +) -> dict: + """Generate the KG Semantic Model — concrete V_{CLASS} views for fast direct queries.""" + model = { + "name": f"{ontology_name}_KG_MODEL", + "description": ( + f"Knowledge Graph model for {ontology_name}. Contains concrete entity views " + f"(V_* per type) and relationship views. Use for SPECIFIC entity lookups, " + f"aggregations, named entity queries, and direct data access. " + f"Best for: 'Who scored most goals?', 'Which customer spent most?', entity-specific questions." 
+ ), + "tables": [], + "relationships": [], + "verified_queries": [], + } + + class_maps = mappings.get("class_mappings", []) + rel_maps = mappings.get("relation_mappings", []) + + # Build class name -> class_map lookup for relationship resolution + cls_by_name = {cm["class_name"]: cm for cm in class_maps} + + # ── Per-class concrete entity views (V_{CLASS}) ── + for cm in class_maps: + cls_name = cm["class_name"] + view_name = f"V_{cls_name.upper()}" + columns = cm.get("columns", []) + id_col = cm["id_column"] + name_col = cm.get("name_column") + + table_def = { + "name": view_name, + "base_table": {"database": database, "schema": schema, "table": view_name}, + "description": f"Concrete view for {cls_name} entities with typed properties", + "primary_key": {"columns": ["NODE_ID"] if id_col.upper() == "NODE_ID" else [id_col]}, + "dimensions": [], + } + + # Add NODE_ID / primary key dimension + pk_name = "NODE_ID" if id_col.upper() in ("NODE_ID", id_col.upper()) else id_col + table_def["dimensions"].append({ + "name": pk_name, "expr": pk_name, "data_type": "STRING", + "description": f"{cls_name} identifier", + }) + + # Add NAME dimension + if name_col: + table_def["dimensions"].append({ + "name": "NAME", "expr": "NAME", "data_type": "STRING", + "description": f"Name of the {cls_name.lower()}", + }) + + # Add typed property dimensions from column metadata + facts_list = [] + for col in columns: + if col.get("is_primary_key") or col.get("is_foreign_key"): + continue + cn = col["name"].upper() + if cn in ("NODE_ID", "NODE_TYPE", "NAME", "PROPS", "TS_INGESTED"): + continue + at = _analyst_type(col["data_type"]) + dim = { + "name": col["name"].upper(), + "expr": col["name"].upper(), + "data_type": at, + "description": f"{col['name']} of {cls_name}", + } + if at == "NUMBER": + facts_list.append(dim) + else: + table_def["dimensions"].append(dim) + + # Add PROPS variant dimension + table_def["dimensions"].append({ + "name": "PROPS", "expr": "PROPS", "data_type": "VARIANT", + 
"description": f"Additional {cls_name.lower()} properties", + }) + + if facts_list: + table_def["facts"] = facts_list + + model["tables"].append(table_def) + + # ── Per-relation concrete relationship views (V_{REL}) — KG path ── + for rm in rel_maps: + rel_name = rm["rel_name"] + view_name = f"V_{rel_name.upper()}" + extra_cols = rm.get("columns", []) + domain = rm.get("domain_class", "") + range_cls = rm.get("range_class", "") + + table_def = { + "name": view_name, + "base_table": {"database": database, "schema": schema, "table": view_name}, + "description": f"Relationship view for {rel_name} ({domain} → {range_cls})", + "dimensions": [ + {"name": "SRC_ID", "expr": "SRC_ID", "data_type": "STRING", + "description": f"Source entity ID ({domain})"}, + {"name": "DST_ID", "expr": "DST_ID", "data_type": "STRING", + "description": f"Target entity ID ({range_cls})"}, + {"name": "EDGE_TYPE", "expr": "EDGE_TYPE", "data_type": "STRING", + "description": "Relationship type"}, + {"name": "PROPS", "expr": "PROPS", "data_type": "VARIANT", + "description": "Additional relationship properties"}, + ], + "time_dimensions": [ + {"name": "EFFECTIVE_START", "expr": "EFFECTIVE_START", "data_type": "DATE", + "description": "Start date of relationship"}, + {"name": "EFFECTIVE_END", "expr": "EFFECTIVE_END", "data_type": "DATE", + "description": "End date of relationship"}, + ], + "facts": [ + {"name": "WEIGHT", "expr": "WEIGHT", "data_type": "NUMBER", + "description": "Relationship weight/strength"}, + ], + } + + # Add extra typed columns from relationship metadata + for col in extra_cols: + at = _analyst_type(col["data_type"]) + dim = { + "name": col["name"].upper(), + "expr": col["name"].upper(), + "data_type": at, + "description": f"{col['name']} of {rel_name} relationship", + } + if at == "NUMBER": + table_def["facts"].append(dim) + else: + table_def["dimensions"].append(dim) + + model["tables"].append(table_def) + + # ── Add relationships between rel view and entity views ── + # 
SRC_ID → source entity view + if domain and domain in cls_by_name: + src_view = f"V_{domain.upper()}" + src_pk = "NODE_ID" + model["relationships"].append({ + "name": f"{rel_name.lower()}_to_{domain.lower()}", + "left_table": view_name, + "right_table": src_view, + "relationship_columns": [{"left_column": "SRC_ID", "right_column": src_pk}], + "join_type": "left_outer", + "relationship_type": "many_to_one", + }) + # DST_ID → target entity view + if range_cls and range_cls in cls_by_name: + dst_view = f"V_{range_cls.upper()}" + dst_pk = "NODE_ID" + model["relationships"].append({ + "name": f"{rel_name.lower()}_to_{range_cls.lower()}", + "left_table": view_name, + "right_table": dst_view, + "relationship_columns": [{"left_column": "DST_ID", "right_column": dst_pk}], + "join_type": "left_outer", + "relationship_type": "many_to_one", + }) + + # ── Verified queries ── + ts = now_ts() + if class_maps: + primary = class_maps[0] + cls_name = primary["class_name"] + view_name = f"V_{cls_name.upper()}" + model["verified_queries"].append({ + "name": f"list_{cls_name.lower()}", + "question": f"Show all {cls_name.lower()} entities", + "sql": f"SELECT NODE_ID, NAME FROM {view_name} ORDER BY NAME LIMIT 50", + "verified_at": ts, + "verified_by": "ontology-stack-builder", + }) + + if rel_maps: + first_rel = rel_maps[0] + rv = f"V_{first_rel['rel_name'].upper()}" + model["verified_queries"].append({ + "name": "relationship_sample", + "question": f"Show sample {first_rel['rel_name']} relationships", + "sql": f"SELECT SRC_ID, DST_ID, EDGE_TYPE, EFFECTIVE_START FROM {rv} LIMIT 20", + "verified_at": ts, + "verified_by": "ontology-stack-builder", + }) + + return model + + +def generate_ontology_model( + classes: list[dict], + relations: list[dict], + mappings: dict, + database: str, + schema: str, + ontology_name: str, + questions: list[str], +) -> dict: + """Generate the Ontology Semantic Model — abstract views for cross-type reasoning.""" + model = { + "name": 
def generate_ontology_model(
    classes: list[dict],
    relations: list[dict],
    mappings: dict,
    database: str,
    schema: str,
    ontology_name: str,
    questions: list[str],
) -> dict:
    """Generate the Ontology Semantic Model — abstract views for cross-type reasoning.

    Args:
        classes: Ontology class definitions (accepted for signature parity; unused).
        relations: Ontology relation definitions (accepted for signature parity; unused).
        mappings: Dict whose optional ``class_mappings`` list yields one
            ``VW_ONT_{CLASS}`` table per entry.
        database: Database holding the abstract views.
        schema: Schema holding the abstract views.
        ontology_name: Prefix for the model name.
        questions: Sample questions (accepted for signature parity; unused).

    Returns:
        A dict with ``name``, ``description``, ``tables`` and
        ``verified_queries``.  ``VW_ONT_ALL_ENTITIES`` is registered only when
        more than one class is mapped; ``REL_RESOLVED`` is always registered.
    """
    model = {
        "name": f"{ontology_name}_ONTOLOGY_MODEL",
        "description": (
            f"Ontology model for {ontology_name}. Contains abstract views (VW_ONT_*) "
            f"that unify entity types and enable cross-type queries. Use for ABSTRACT reasoning: "
            f"'What types of entities exist?', 'Show all people', hierarchy traversal, "
            f"entity unification, polymorphic queries across types."
        ),
        "tables": [],
        "verified_queries": [],
    }

    class_maps = mappings.get("class_mappings", [])

    # Add abstract VW_ONT_* views
    for cm in class_maps:
        cls_name = cm["class_name"]
        view_name = f"VW_ONT_{cls_name.upper()}"
        model["tables"].append({
            "name": view_name,
            "base_table": {"database": database, "schema": schema, "table": view_name},
            "description": f"Abstract ontology view for {cls_name} — unified entity interface",
            "primary_key": {"columns": ["ENTITY_ID"]},
            "dimensions": [
                {"name": "ENTITY_ID", "expr": "ENTITY_ID", "data_type": "VARCHAR",
                 "description": f"Unique identifier for {cls_name}"},
                {"name": "ENTITY_TYPE", "expr": "ENTITY_TYPE", "data_type": "VARCHAR",
                 "description": "Ontology class type"},
                {"name": "ENTITY_NAME", "expr": "ENTITY_NAME", "data_type": "VARCHAR",
                 "description": f"Display name for {cls_name}"},
            ],
        })

    # Unified entity view.  `entity_view` is the table the entity-level
    # verified queries run against; it must be a table this model registers.
    if len(class_maps) > 1:
        entity_view = "VW_ONT_ALL_ENTITIES"
        model["tables"].append({
            "name": "VW_ONT_ALL_ENTITIES",
            "base_table": {"database": database, "schema": schema, "table": "VW_ONT_ALL_ENTITIES"},
            "description": "Unified view of ALL entity types. Use for cross-type queries.",
            "primary_key": {"columns": ["ENTITY_ID"]},
            "dimensions": [
                {"name": "ENTITY_ID", "expr": "ENTITY_ID", "data_type": "VARCHAR"},
                {"name": "ENTITY_TYPE", "expr": "ENTITY_TYPE", "data_type": "VARCHAR",
                 "description": "The ontology class this entity belongs to"},
                {"name": "ENTITY_NAME", "expr": "ENTITY_NAME", "data_type": "VARCHAR"},
            ],
        })
    elif class_maps:
        # A single class gets no unified view; its own abstract view exposes
        # the same ENTITY_* columns, so the queries target that instead.
        entity_view = f"VW_ONT_{class_maps[0]['class_name'].upper()}"
    else:
        entity_view = None

    # REL_RESOLVED for cross-type relationship queries
    model["tables"].append({
        "name": "REL_RESOLVED",
        "base_table": {"database": database, "schema": schema, "table": "REL_RESOLVED"},
        "description": "Resolved relationships for cross-type reasoning",
        "dimensions": [
            {"name": "REL_NAME", "expr": "REL_NAME", "data_type": "VARCHAR"},
            {"name": "SRC_ID", "expr": "SRC_ID", "data_type": "VARCHAR"},
            {"name": "SRC_NAME", "expr": "SRC_NAME", "data_type": "VARCHAR"},
            {"name": "SRC_TYPE", "expr": "SRC_TYPE", "data_type": "VARCHAR"},
            {"name": "DST_ID", "expr": "DST_ID", "data_type": "VARCHAR"},
            {"name": "DST_NAME", "expr": "DST_NAME", "data_type": "VARCHAR"},
            {"name": "DST_TYPE", "expr": "DST_TYPE", "data_type": "VARCHAR"},
        ],
        "facts": [
            {"name": "WEIGHT", "expr": "WEIGHT", "data_type": "FLOAT"},
        ],
    })

    # Verified queries
    ts = now_ts()
    if entity_view is not None:
        model["verified_queries"].extend([
            {
                "name": "entity_type_counts",
                "question": "How many entities of each type exist?",
                "sql": f"SELECT ENTITY_TYPE, COUNT(*) AS cnt FROM {entity_view} GROUP BY ENTITY_TYPE ORDER BY cnt DESC",
                "verified_at": ts,
                "verified_by": "ontology-stack-builder",
            },
            {
                "name": "search_entities",
                "question": "Find all entities matching a name pattern",
                # NOTE(review): '{search_term}' is emitted literally, exactly as
                # before — confirm the consumer substitutes it.
                "sql": f"SELECT ENTITY_ID, ENTITY_TYPE, ENTITY_NAME FROM {entity_view} WHERE ENTITY_NAME ILIKE '%{{search_term}}%' ORDER BY ENTITY_TYPE, ENTITY_NAME LIMIT 50",
                "verified_at": ts,
                "verified_by": "ontology-stack-builder",
            },
        ])
    model["verified_queries"].append({
        "name": "cross_type_relationships",
        "question": "What relationships connect different entity types?",
        "sql": "SELECT SRC_TYPE, REL_NAME, DST_TYPE, COUNT(*) AS cnt FROM REL_RESOLVED GROUP BY SRC_TYPE, REL_NAME, DST_TYPE ORDER BY cnt DESC LIMIT 50",
        "verified_at": ts,
        "verified_by": "ontology-stack-builder",
    })

    return model
types?", + "sql": "SELECT SRC_TYPE, REL_NAME, DST_TYPE, COUNT(*) AS cnt FROM REL_RESOLVED GROUP BY SRC_TYPE, REL_NAME, DST_TYPE ORDER BY cnt DESC LIMIT 50", + "verified_at": ts, + "verified_by": "ontology-stack-builder", + }, + ]) + + return model + + +def generate_metadata_model( + database: str, + schema: str, + ontology_name: str, +) -> dict: + """Generate the Metadata & Governance Model — all ~23 introspection tables.""" + + def _tbl(name: str) -> dict: + return {"database": database, "schema": schema, "table": name} + + model = { + "name": f"{ontology_name}_METADATA_MODEL", + "description": ( + f"Metadata and governance model for {ontology_name}. Contains ontology structure " + f"definitions: classes, relations, properties, interfaces, rules, actions, roles, " + f"permissions, functions, and data quality. Use for INTROSPECTION: " + f"'What classes are defined?', 'What properties does X have?', 'Who has access?', " + f"schema governance, data catalog questions." + ), + "tables": [ + # ONT_ONTOLOGY + { + "name": "ONT_ONTOLOGY", + "base_table": _tbl("ONT_ONTOLOGY"), + "description": "Ontology registry with versioning", + "primary_key": {"columns": ["ONTOLOGY_NAME"]}, + "dimensions": [ + {"name": "ONTOLOGY_NAME", "expr": "ONTOLOGY_NAME", "data_type": "STRING", "description": "Unique ontology identifier"}, + {"name": "VERSION", "expr": "VERSION", "data_type": "STRING", "description": "Ontology version"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + {"name": "DEFAULT_SCHEMA", "expr": "DEFAULT_SCHEMA", "data_type": "STRING"}, + {"name": "CREATED_BY", "expr": "CREATED_BY", "data_type": "STRING"}, + {"name": "IS_ACTIVE", "expr": "IS_ACTIVE", "data_type": "BOOLEAN"}, + ], + "time_dimensions": [ + {"name": "CREATED_AT", "expr": "CREATED_AT", "data_type": "TIMESTAMP"}, + ], + }, + # ONT_OBJECT_SOURCE + { + "name": "ONT_OBJECT_SOURCE", + "base_table": _tbl("ONT_OBJECT_SOURCE"), + "description": "Maps object types to source tables with column 
mappings", + "primary_key": {"columns": ["ONTOLOGY_NAME", "OBJ_TYPE", "SOURCE_TABLE"]}, + "dimensions": [ + {"name": "ONTOLOGY_NAME", "expr": "ONTOLOGY_NAME", "data_type": "STRING"}, + {"name": "OBJ_TYPE", "expr": "OBJ_TYPE", "data_type": "STRING", "description": "Object type name"}, + {"name": "SOURCE_TABLE", "expr": "SOURCE_TABLE", "data_type": "STRING"}, + {"name": "FILTER_SQL", "expr": "FILTER_SQL", "data_type": "STRING"}, + {"name": "MAPPING", "expr": "MAPPING", "data_type": "VARIANT", "description": "JSON column mapping"}, + ], + }, + # ONT_LINK_SOURCE + { + "name": "ONT_LINK_SOURCE", + "base_table": _tbl("ONT_LINK_SOURCE"), + "description": "Maps link types to source tables with column mappings", + "primary_key": {"columns": ["ONTOLOGY_NAME", "LINK_TYPE", "SOURCE_TABLE"]}, + "dimensions": [ + {"name": "ONTOLOGY_NAME", "expr": "ONTOLOGY_NAME", "data_type": "STRING"}, + {"name": "LINK_TYPE", "expr": "LINK_TYPE", "data_type": "STRING", "description": "Link type name"}, + {"name": "SOURCE_TABLE", "expr": "SOURCE_TABLE", "data_type": "STRING"}, + {"name": "FILTER_SQL", "expr": "FILTER_SQL", "data_type": "STRING"}, + {"name": "MAPPING", "expr": "MAPPING", "data_type": "VARIANT"}, + ], + }, + # ONT_CLASS + { + "name": "ONT_CLASS", + "base_table": _tbl("ONT_CLASS"), + "description": "Object type definitions with class hierarchy", + "primary_key": {"columns": ["CLASS_NAME"]}, + "dimensions": [ + {"name": "CLASS_NAME", "expr": "CLASS_NAME", "data_type": "STRING", "description": "Unique class name"}, + {"name": "PARENT_CLASS_NAME", "expr": "PARENT_CLASS_NAME", "data_type": "STRING", "description": "Parent class for hierarchy"}, + {"name": "IS_ABSTRACT", "expr": "IS_ABSTRACT", "data_type": "BOOLEAN"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + {"name": "ONTOLOGY_NAME", "expr": "ONTOLOGY_NAME", "data_type": "STRING"}, + {"name": "TYPE_CLASS", "expr": "TYPE_CLASS", "data_type": "STRING", "description": "ANALYTICAL or OPERATIONAL"}, + ], + 
}, + # ONT_RELATION_DEF + { + "name": "ONT_RELATION_DEF", + "base_table": _tbl("ONT_RELATION_DEF"), + "description": "Relationship type definitions with cardinality", + "primary_key": {"columns": ["REL_NAME"]}, + "dimensions": [ + {"name": "REL_NAME", "expr": "REL_NAME", "data_type": "STRING", "description": "Relationship type name"}, + {"name": "DOMAIN_CLASS", "expr": "DOMAIN_CLASS", "data_type": "STRING", "description": "Source class"}, + {"name": "RANGE_CLASS", "expr": "RANGE_CLASS", "data_type": "STRING", "description": "Target class"}, + {"name": "CARDINALITY", "expr": "CARDINALITY", "data_type": "STRING"}, + {"name": "IS_HIERARCHICAL", "expr": "IS_HIERARCHICAL", "data_type": "BOOLEAN"}, + {"name": "INVERSE_REL_NAME", "expr": "INVERSE_REL_NAME", "data_type": "STRING"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + {"name": "ONTOLOGY_NAME", "expr": "ONTOLOGY_NAME", "data_type": "STRING"}, + ], + }, + # ONT_SHARED_PROPERTY + { + "name": "ONT_SHARED_PROPERTY", + "base_table": _tbl("ONT_SHARED_PROPERTY"), + "description": "Shared properties reusable across object types", + "primary_key": {"columns": ["SHARED_PROP_NAME"]}, + "dimensions": [ + {"name": "SHARED_PROP_NAME", "expr": "SHARED_PROP_NAME", "data_type": "STRING"}, + {"name": "BASE_TYPE", "expr": "BASE_TYPE", "data_type": "STRING"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + {"name": "DEFAULT_FORMAT", "expr": "DEFAULT_FORMAT", "data_type": "STRING"}, + ], + }, + # ONT_PROPERTY + { + "name": "ONT_PROPERTY", + "base_table": _tbl("ONT_PROPERTY"), + "description": "Properties for each object type", + "primary_key": {"columns": ["CLASS_NAME", "PROP_NAME"]}, + "dimensions": [ + {"name": "CLASS_NAME", "expr": "CLASS_NAME", "data_type": "STRING"}, + {"name": "PROP_NAME", "expr": "PROP_NAME", "data_type": "STRING"}, + {"name": "DATA_TYPE", "expr": "DATA_TYPE", "data_type": "STRING"}, + {"name": "SHARED_PROP_NAME", "expr": "SHARED_PROP_NAME", "data_type": 
"STRING"}, + {"name": "IS_REQUIRED", "expr": "IS_REQUIRED", "data_type": "BOOLEAN"}, + {"name": "IS_INDEXED", "expr": "IS_INDEXED", "data_type": "BOOLEAN"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + ], + }, + # ONT_DERIVED_PROPERTY + { + "name": "ONT_DERIVED_PROPERTY", + "base_table": _tbl("ONT_DERIVED_PROPERTY"), + "description": "Computed/derived properties with definitions", + "primary_key": {"columns": ["CLASS_NAME", "PROP_NAME"]}, + "dimensions": [ + {"name": "CLASS_NAME", "expr": "CLASS_NAME", "data_type": "STRING"}, + {"name": "PROP_NAME", "expr": "PROP_NAME", "data_type": "STRING"}, + {"name": "DEFINITION_KIND", "expr": "DEFINITION_KIND", "data_type": "STRING", "description": "SQL or FUNCTION"}, + {"name": "SQL_EXPR", "expr": "SQL_EXPR", "data_type": "STRING"}, + {"name": "FUNCTION_NAME", "expr": "FUNCTION_NAME", "data_type": "STRING"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + ], + }, + # ONT_INTERFACE + { + "name": "ONT_INTERFACE", + "base_table": _tbl("ONT_INTERFACE"), + "description": "Interface definitions for polymorphism", + "primary_key": {"columns": ["INTERFACE_NAME"]}, + "dimensions": [ + {"name": "INTERFACE_NAME", "expr": "INTERFACE_NAME", "data_type": "STRING"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + ], + }, + # ONT_INTERFACE_PROPERTY + { + "name": "ONT_INTERFACE_PROPERTY", + "base_table": _tbl("ONT_INTERFACE_PROPERTY"), + "description": "Properties required by each interface", + "primary_key": {"columns": ["INTERFACE_NAME", "PROP_NAME"]}, + "dimensions": [ + {"name": "INTERFACE_NAME", "expr": "INTERFACE_NAME", "data_type": "STRING"}, + {"name": "PROP_NAME", "expr": "PROP_NAME", "data_type": "STRING"}, + {"name": "SHARED_PROP_NAME", "expr": "SHARED_PROP_NAME", "data_type": "STRING"}, + ], + }, + # ONT_INTERFACE_IMPL + { + "name": "ONT_INTERFACE_IMPL", + "base_table": _tbl("ONT_INTERFACE_IMPL"), + "description": "Maps classes to interfaces they 
implement", + "primary_key": {"columns": ["INTERFACE_NAME", "CLASS_NAME"]}, + "dimensions": [ + {"name": "INTERFACE_NAME", "expr": "INTERFACE_NAME", "data_type": "STRING"}, + {"name": "CLASS_NAME", "expr": "CLASS_NAME", "data_type": "STRING"}, + ], + }, + # ONT_CLASS_MAP + { + "name": "ONT_CLASS_MAP", + "base_table": _tbl("ONT_CLASS_MAP"), + "description": "Maps ontology classes to physical tables", + "dimensions": [ + {"name": "CLASS_NAME", "expr": "CLASS_NAME", "data_type": "STRING"}, + {"name": "SOURCE_DATABASE", "expr": "SOURCE_DATABASE", "data_type": "STRING"}, + {"name": "SOURCE_SCHEMA", "expr": "SOURCE_SCHEMA", "data_type": "STRING"}, + {"name": "SOURCE_TABLE", "expr": "SOURCE_TABLE", "data_type": "STRING"}, + {"name": "ID_EXPR", "expr": "ID_EXPR", "data_type": "STRING"}, + {"name": "NAME_EXPR", "expr": "NAME_EXPR", "data_type": "STRING"}, + ], + }, + # ONT_REL_MAP + { + "name": "ONT_REL_MAP", + "base_table": _tbl("ONT_REL_MAP"), + "description": "Maps ontology relationships to physical tables", + "dimensions": [ + {"name": "REL_NAME", "expr": "REL_NAME", "data_type": "STRING"}, + {"name": "SOURCE_DATABASE", "expr": "SOURCE_DATABASE", "data_type": "STRING"}, + {"name": "SOURCE_SCHEMA", "expr": "SOURCE_SCHEMA", "data_type": "STRING"}, + {"name": "SOURCE_TABLE", "expr": "SOURCE_TABLE", "data_type": "STRING"}, + {"name": "SRC_ID_EXPR", "expr": "SRC_ID_EXPR", "data_type": "STRING"}, + {"name": "DST_ID_EXPR", "expr": "DST_ID_EXPR", "data_type": "STRING"}, + ], + }, + # ONT_RULE + { + "name": "ONT_RULE", + "base_table": _tbl("ONT_RULE"), + "description": "Inference rule registry", + "primary_key": {"columns": ["RULE_ID"]}, + "dimensions": [ + {"name": "RULE_ID", "expr": "RULE_ID", "data_type": "STRING"}, + {"name": "RULE_KIND", "expr": "RULE_KIND", "data_type": "STRING", "description": "TRANSITIVE, PROPERTY_CHAIN, or INVERSE"}, + {"name": "TARGET_REL", "expr": "TARGET_REL", "data_type": "STRING"}, + {"name": "IS_ENABLED", "expr": "IS_ENABLED", "data_type": 
"BOOLEAN"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + ], + "time_dimensions": [ + {"name": "TS_CREATED", "expr": "TS_CREATED", "data_type": "TIMESTAMP"}, + ], + }, + # REL_EDGE_INFERRED + { + "name": "REL_EDGE_INFERRED", + "base_table": _tbl("REL_EDGE_INFERRED"), + "description": "Inferred relationships from rules", + "primary_key": {"columns": ["REL_NAME", "SRC_ID", "DST_ID", "RULE_ID"]}, + "dimensions": [ + {"name": "REL_NAME", "expr": "REL_NAME", "data_type": "STRING"}, + {"name": "SRC_ID", "expr": "SRC_ID", "data_type": "STRING"}, + {"name": "DST_ID", "expr": "DST_ID", "data_type": "STRING"}, + {"name": "INFERENCE_KIND", "expr": "INFERENCE_KIND", "data_type": "STRING"}, + {"name": "RULE_ID", "expr": "RULE_ID", "data_type": "STRING"}, + ], + "time_dimensions": [ + {"name": "COMPUTED_AT", "expr": "COMPUTED_AT", "data_type": "TIMESTAMP"}, + ], + "facts": [ + {"name": "WEIGHT", "expr": "WEIGHT", "data_type": "NUMBER"}, + ], + }, + # ONT_CONSTRAINT_VIOLATION + { + "name": "ONT_CONSTRAINT_VIOLATION", + "base_table": _tbl("ONT_CONSTRAINT_VIOLATION"), + "description": "Data quality constraint violations", + "primary_key": {"columns": ["VIOLATION_ID"]}, + "dimensions": [ + {"name": "VIOLATION_ID", "expr": "VIOLATION_ID", "data_type": "STRING"}, + {"name": "CHECK_NAME", "expr": "CHECK_NAME", "data_type": "STRING"}, + {"name": "SCOPE", "expr": "SCOPE", "data_type": "STRING"}, + {"name": "REL_OR_CLASS", "expr": "REL_OR_CLASS", "data_type": "STRING"}, + {"name": "DETAILS", "expr": "DETAILS", "data_type": "STRING"}, + ], + "time_dimensions": [ + {"name": "OBSERVED_AT", "expr": "OBSERVED_AT", "data_type": "TIMESTAMP"}, + ], + }, + # ACT_TYPE + { + "name": "ACT_TYPE", + "base_table": _tbl("ACT_TYPE"), + "description": "Action type definitions", + "primary_key": {"columns": ["ACTION_TYPE_ID"]}, + "dimensions": [ + {"name": "ACTION_TYPE_ID", "expr": "ACTION_TYPE_ID", "data_type": "STRING"}, + {"name": "ACTION_NAME", "expr": "ACTION_NAME", 
"data_type": "STRING"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + {"name": "ONTOLOGY_NAME", "expr": "ONTOLOGY_NAME", "data_type": "STRING"}, + {"name": "TARGET_CLASS", "expr": "TARGET_CLASS", "data_type": "STRING"}, + {"name": "HANDLER_PROC", "expr": "HANDLER_PROC", "data_type": "STRING"}, + {"name": "IS_ENABLED", "expr": "IS_ENABLED", "data_type": "BOOLEAN"}, + ], + "time_dimensions": [ + {"name": "TS_CREATED", "expr": "TS_CREATED", "data_type": "TIMESTAMP"}, + ], + }, + # ACT_DEF + { + "name": "ACT_DEF", + "base_table": _tbl("ACT_DEF"), + "description": "Action parameter definitions", + "primary_key": {"columns": ["ACTION_TYPE_ID", "PARAM_NAME"]}, + "dimensions": [ + {"name": "ACTION_TYPE_ID", "expr": "ACTION_TYPE_ID", "data_type": "STRING"}, + {"name": "PARAM_NAME", "expr": "PARAM_NAME", "data_type": "STRING"}, + {"name": "PARAM_TYPE", "expr": "PARAM_TYPE", "data_type": "STRING"}, + {"name": "IS_REQUIRED", "expr": "IS_REQUIRED", "data_type": "BOOLEAN"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + ], + }, + # ONT_FUNCTION + { + "name": "ONT_FUNCTION", + "base_table": _tbl("ONT_FUNCTION"), + "description": "Function catalog — versioned code artifacts", + "primary_key": {"columns": ["ONTOLOGY_NAME", "FUNCTION_NAME", "VERSION"]}, + "dimensions": [ + {"name": "FUNCTION_NAME", "expr": "FUNCTION_NAME", "data_type": "STRING"}, + {"name": "VERSION", "expr": "VERSION", "data_type": "STRING"}, + {"name": "LANGUAGE", "expr": "LANGUAGE", "data_type": "STRING", "description": "SQL, PYTHON, JS, or EXTERNAL"}, + {"name": "SNOWFLAKE_REF", "expr": "SNOWFLAKE_REF", "data_type": "STRING"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + {"name": "ONTOLOGY_NAME", "expr": "ONTOLOGY_NAME", "data_type": "STRING"}, + ], + }, + # ONT_FUNCTION_BINDING + { + "name": "ONT_FUNCTION_BINDING", + "base_table": _tbl("ONT_FUNCTION_BINDING"), + "description": "Binds functions to object types, link types, or 
actions", + "dimensions": [ + {"name": "ONTOLOGY_NAME", "expr": "ONTOLOGY_NAME", "data_type": "STRING"}, + {"name": "FUNCTION_NAME", "expr": "FUNCTION_NAME", "data_type": "STRING"}, + {"name": "VERSION", "expr": "VERSION", "data_type": "STRING"}, + {"name": "BOUND_TO_KIND", "expr": "BOUND_TO_KIND", "data_type": "STRING", "description": "OBJECT_TYPE, LINK_TYPE, or ACTION_TYPE"}, + {"name": "BOUND_TO_NAME", "expr": "BOUND_TO_NAME", "data_type": "STRING"}, + ], + }, + # OBJ_VIEW_DEF + { + "name": "OBJ_VIEW_DEF", + "base_table": _tbl("OBJ_VIEW_DEF"), + "description": "Object view definitions for UI and governance", + "primary_key": {"columns": ["OBJ_TYPE", "VIEW_NAME"]}, + "dimensions": [ + {"name": "OBJ_TYPE", "expr": "OBJ_TYPE", "data_type": "STRING"}, + {"name": "VIEW_NAME", "expr": "VIEW_NAME", "data_type": "STRING"}, + {"name": "CREATED_BY", "expr": "CREATED_BY", "data_type": "STRING"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + {"name": "DISPLAY_COLS", "expr": "DISPLAY_COLS", "data_type": "VARIANT"}, + {"name": "VERSION", "expr": "VERSION", "data_type": "STRING"}, + {"name": "STATUS", "expr": "STATUS", "data_type": "STRING"}, + ], + "time_dimensions": [ + {"name": "TS_CREATED", "expr": "TS_CREATED", "data_type": "TIMESTAMP"}, + ], + }, + # OBJ_VIEW_FIELD + { + "name": "OBJ_VIEW_FIELD", + "base_table": _tbl("OBJ_VIEW_FIELD"), + "description": "Field configuration for object views", + "primary_key": {"columns": ["OBJ_TYPE", "VIEW_NAME", "VERSION", "PROP_NAME"]}, + "dimensions": [ + {"name": "OBJ_TYPE", "expr": "OBJ_TYPE", "data_type": "STRING"}, + {"name": "VIEW_NAME", "expr": "VIEW_NAME", "data_type": "STRING"}, + {"name": "VERSION", "expr": "VERSION", "data_type": "STRING"}, + {"name": "PROP_NAME", "expr": "PROP_NAME", "data_type": "STRING"}, + {"name": "RENDER_HINT", "expr": "RENDER_HINT", "data_type": "STRING"}, + ], + "facts": [ + {"name": "FIELD_ORDER", "expr": "FIELD_ORDER", "data_type": "NUMBER"}, + ], + }, + # ONT_ROLE + { + 
"name": "ONT_ROLE", + "base_table": _tbl("ONT_ROLE"), + "description": "Ontology-specific roles for access control", + "primary_key": {"columns": ["ONTOLOGY_NAME", "ONT_ROLE_NAME"]}, + "dimensions": [ + {"name": "ONTOLOGY_NAME", "expr": "ONTOLOGY_NAME", "data_type": "STRING"}, + {"name": "ONT_ROLE_NAME", "expr": "ONT_ROLE_NAME", "data_type": "STRING"}, + {"name": "DESCRIPTION", "expr": "DESCRIPTION", "data_type": "STRING"}, + ], + }, + # ONT_ROLE_BINDING + { + "name": "ONT_ROLE_BINDING", + "base_table": _tbl("ONT_ROLE_BINDING"), + "description": "Maps ontology roles to Snowflake roles", + "primary_key": {"columns": ["ONTOLOGY_NAME", "ONT_ROLE_NAME", "SNOWFLAKE_ROLE"]}, + "dimensions": [ + {"name": "ONTOLOGY_NAME", "expr": "ONTOLOGY_NAME", "data_type": "STRING"}, + {"name": "ONT_ROLE_NAME", "expr": "ONT_ROLE_NAME", "data_type": "STRING"}, + {"name": "SNOWFLAKE_ROLE", "expr": "SNOWFLAKE_ROLE", "data_type": "STRING"}, + ], + }, + # ONT_PERMISSION + { + "name": "ONT_PERMISSION", + "base_table": _tbl("ONT_PERMISSION"), + "description": "Granular permissions for object types, link types, and actions", + "primary_key": {"columns": ["ONTOLOGY_NAME", "SUBJECT_KIND", "SUBJECT_NAME", "ONT_ROLE_NAME", "PRIVILEGE"]}, + "dimensions": [ + {"name": "ONTOLOGY_NAME", "expr": "ONTOLOGY_NAME", "data_type": "STRING"}, + {"name": "SUBJECT_KIND", "expr": "SUBJECT_KIND", "data_type": "STRING", "description": "OBJECT_TYPE, LINK_TYPE, or ACTION_TYPE"}, + {"name": "SUBJECT_NAME", "expr": "SUBJECT_NAME", "data_type": "STRING"}, + {"name": "ONT_ROLE_NAME", "expr": "ONT_ROLE_NAME", "data_type": "STRING"}, + {"name": "PRIVILEGE", "expr": "PRIVILEGE", "data_type": "STRING", "description": "READ, WRITE, EXECUTE, or ADMIN"}, + ], + }, + ], + "verified_queries": [], + } + + ts = now_ts() + model["verified_queries"].extend([ + { + "name": "list_all_classes", + "question": "What ontology classes are defined?", + "sql": "SELECT CLASS_NAME, PARENT_CLASS_NAME, IS_ABSTRACT, DESCRIPTION FROM ONT_CLASS 
def _load_json(path):
    """Parse the JSON document stored at *path*.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's locale encoding, which is what a bare ``open()`` would use.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _write_model(model, path):
    """Dump one semantic model to *path* as YAML and print a one-line summary.

    ``allow_unicode=True`` lets non-ASCII characters through unescaped, so the
    file has to be written as UTF-8 rather than in the locale encoding.
    """
    text = yaml.dump(model, default_flow_style=False, sort_keys=False, allow_unicode=True, width=120)
    path.write_text(text, encoding="utf-8")
    print(f" Wrote {path} ({len(model['tables'])} tables, {len(model['verified_queries'])} verified queries)")


def main():
    """CLI entry point: build the selected semantic models and write them as YAML.

    Reads the classes, relations and mappings JSON files, generates each model
    named in ``--models`` (``kg``, ``ontology``, ``metadata``) and writes one
    YAML file per model into ``--output-dir``.

    Exits through ``parser.error`` (status 2) when ``--models`` names no model
    or names one that does not exist, instead of silently producing nothing.
    """
    parser = argparse.ArgumentParser(description="Generate Semantic Model YAMLs")
    parser.add_argument("--classes-json", required=True)
    parser.add_argument("--relations-json", required=True)
    parser.add_argument("--mappings-json", required=True)
    parser.add_argument("--database", required=True)
    parser.add_argument("--schema", required=True)
    parser.add_argument("--ontology-name", required=True)
    parser.add_argument("--models", required=True, help="Comma-separated: kg,ontology,metadata")
    parser.add_argument("--questions", default="", help="Pipe-separated business questions")
    parser.add_argument("--output-dir", required=True)
    args = parser.parse_args()

    # Output file per model; insertion order is the order the files are generated in.
    model_files = {
        "kg": "kg_semantic_model.yaml",
        "ontology": "ontology_semantic_model.yaml",
        "metadata": "metadata_governance_model.yaml",
    }

    selected_models = [m.strip().lower() for m in args.models.split(",") if m.strip()]
    if not selected_models:
        parser.error("--models must name at least one of: " + ", ".join(model_files))
    unknown = [m for m in selected_models if m not in model_files]
    if unknown:
        parser.error(
            f"unknown model(s) in --models: {', '.join(unknown)} "
            f"(choose from: {', '.join(model_files)})"
        )

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    classes = _load_json(args.classes_json)
    relations = _load_json(args.relations_json)
    mappings = _load_json(args.mappings_json)

    questions = [q.strip() for q in args.questions.split("|") if q.strip()]

    database = args.database
    schema = args.schema
    ontology_name = args.ontology_name

    print(f"Generating semantic models: {', '.join(selected_models)}")
    files_written = []

    for model_name, file_name in model_files.items():
        if model_name not in selected_models:
            continue
        if model_name == "kg":
            model = generate_kg_model(classes, relations, mappings, database, schema, ontology_name, questions)
        elif model_name == "ontology":
            model = generate_ontology_model(classes, relations, mappings, database, schema, ontology_name, questions)
        else:
            model = generate_metadata_model(database, schema, ontology_name)
        path = output_dir / file_name
        _write_model(model, path)
        files_written.append(str(path))

    print(f"\nGenerated {len(files_written)} semantic model files")
def generate_spcs_graph_service(database: str, schema: str) -> str:
    """Generate the Python graph analytics service for SPCS.

    The returned text is the source of a Flask application that loads the
    KG_NODE / KG_EDGE tables into a NetworkX graph and exposes centrality,
    community-detection, shortest-path and reload endpoints.

    Every POST endpoint of the generated service accepts either a plain JSON
    object or the batch format ``{"data": [[row_index, arg, ...]]}``, and
    answers in the matching shape.

    Args:
        database: Database name embedded in the generated session config.
        schema: Schema name embedded in the generated session config.

    Returns:
        The service source code as a single string.
    """
    import json  # function-scope: not needed anywhere else in this module

    # Embed the names as quoted, escaped string literals so that a quote or
    # backslash in a name cannot break the generated source.
    database_literal = json.dumps(database, ensure_ascii=False)
    schema_literal = json.dumps(schema, ensure_ascii=False)

    return '''"""
Graph Analytics Service for Snowpark Container Services.

Provides NetworkX-based graph analytics exposed via Flask endpoints.
Reads KG_NODE and KG_EDGE tables from Snowflake to build the graph.

Every POST endpoint accepts either a plain JSON object (direct HTTP calls) or
the Snowflake service-function batch format {"data": [[row_index, arg, ...]]}
and answers in the matching shape.
"""

import os
import json
import logging
from flask import Flask, request, jsonify

import networkx as nx
from snowflake.snowpark import Session
from community import community_louvain

app = Flask(__name__)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global graph cache
_graph: nx.Graph | None = None
_node_attrs: dict = {}


def get_session() -> Session:
    """Create Snowpark session from SPCS environment."""
    with open("/snowflake/session/token") as token_file:
        token = token_file.read().strip()
    return Session.builder.configs({
        "account": os.environ["SNOWFLAKE_ACCOUNT"],
        "host": os.environ["SNOWFLAKE_HOST"],
        "token": token,
        "authenticator": "oauth",
        "database": ''' + database_literal + ''',
        "schema": ''' + schema_literal + ''',
    }).create()


def load_graph() -> nx.Graph:
    """Load KG_NODE and KG_EDGE into a NetworkX graph."""
    global _graph, _node_attrs
    if _graph is not None:
        return _graph

    logger.info("Loading graph from KG_NODE/KG_EDGE...")
    session = get_session()
    try:
        nodes_df = session.sql("SELECT NODE_ID, NODE_TYPE, NAME FROM KG_NODE").collect()
        edges_df = session.sql("SELECT SRC_ID, DST_ID, EDGE_TYPE, WEIGHT FROM KG_EDGE WHERE EFFECTIVE_END IS NULL").collect()
    finally:
        # Release the session even when a query fails.
        session.close()

    G = nx.Graph()
    for row in nodes_df:
        G.add_node(row["NODE_ID"], node_type=row["NODE_TYPE"], name=row["NAME"])
        _node_attrs[row["NODE_ID"]] = {"type": row["NODE_TYPE"], "name": row["NAME"]}

    for row in edges_df:
        G.add_edge(row["SRC_ID"], row["DST_ID"],
                   edge_type=row["EDGE_TYPE"],
                   weight=float(row["WEIGHT"] or 1.0))

    _graph = G
    logger.info(f"Graph loaded: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
    return G


def _dispatch(handler, arg_names=()):
    """Run handler(args) for the current request and build the HTTP response.

    handler takes a dict of arguments and returns (payload, http_status).
    A body of the form {"data": [[row_index, arg, ...], ...]} is treated as a
    Snowflake service-function batch: positional values are mapped onto
    arg_names (NULLs fall back to the handler defaults) and one result per
    row is returned as {"data": [[row_index, payload], ...]}.
    Any other body is treated as a plain JSON object of arguments.
    """
    body = request.get_json(silent=True)
    rows = body.get("data") if isinstance(body, dict) else None
    if isinstance(rows, list):
        results = []
        for row in rows:
            row_index, *values = row
            args = {name: value for name, value in zip(arg_names, values) if value is not None}
            # Per-row errors are reported inside the payload so one bad row
            # does not fail the whole batch.
            payload, _status = handler(args)
            results.append([row_index, payload])
        return jsonify({"data": results})

    payload, status = handler(body if isinstance(body, dict) else {})
    return jsonify(payload), status


@app.route("/health", methods=["GET"])
def health():
    return jsonify({"status": "healthy"})


def _centrality(args):
    metric = args.get("metric", "degree")
    entity_type = args.get("entity_type")
    top_n = args.get("top_n", 10)
    if top_n is not None:
        try:
            top_n = int(top_n)
        except (TypeError, ValueError):
            return {"error": "top_n must be an integer"}, 400

    G = load_graph()

    if entity_type:
        nodes = [n for n, d in G.nodes(data=True) if d.get("node_type") == entity_type]
        subgraph = G.subgraph(nodes)
    else:
        subgraph = G

    if metric == "degree":
        scores = nx.degree_centrality(subgraph)
    elif metric == "betweenness":
        scores = nx.betweenness_centrality(subgraph, k=min(100, len(subgraph)))
    elif metric == "pagerank":
        scores = nx.pagerank(subgraph)
    else:
        return {"error": f"Unknown metric: {metric}"}, 400

    top = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    results = []
    for node_id, score in top:
        attrs = _node_attrs.get(node_id, {})
        results.append({
            "node_id": node_id,
            "name": attrs.get("name", ""),
            "type": attrs.get("type", ""),
            "score": round(score, 6),
        })

    return {"metric": metric, "results": results}, 200


@app.route("/centrality", methods=["POST"])
def centrality():
    """Compute centrality metrics."""
    return _dispatch(_centrality, ("metric", "entity_type", "top_n"))


def _community(args):
    resolution = args.get("resolution", 1.0)

    G = load_graph()
    partition = community_louvain.best_partition(G, resolution=resolution)

    communities = {}
    for node_id, comm_id in partition.items():
        if comm_id not in communities:
            communities[comm_id] = []
        attrs = _node_attrs.get(node_id, {})
        communities[comm_id].append({
            "node_id": node_id,
            "name": attrs.get("name", ""),
            "type": attrs.get("type", ""),
        })

    summary = [{"community_id": cid, "size": len(members), "sample": members[:5]}
               for cid, members in sorted(communities.items(), key=lambda x: -len(x[1]))]

    return {"num_communities": len(communities), "communities": summary[:20]}, 200


@app.route("/community", methods=["POST"])
def community_detection():
    """Detect communities using Louvain."""
    return _dispatch(_community, ("resolution",))


def _shortest_path(args):
    source = args.get("source_id")
    target = args.get("target_id")

    if not source or not target:
        return {"error": "source_id and target_id are required"}, 400

    G = load_graph()

    if source not in G:
        return {"error": f"Source node {source} not found"}, 404
    if target not in G:
        return {"error": f"Target node {target} not found"}, 404

    try:
        path = nx.shortest_path(G, source, target)
    except nx.NetworkXNoPath:
        return {"error": "No path exists between these nodes", "path": []}, 200

    path_details = []
    for i, node_id in enumerate(path):
        attrs = _node_attrs.get(node_id, {})
        entry = {"step": i, "node_id": node_id, "name": attrs.get("name", ""), "type": attrs.get("type", "")}
        if i < len(path) - 1:
            edge_data = G.edges[node_id, path[i + 1]]
            entry["edge_type"] = edge_data.get("edge_type", "")
        path_details.append(entry)

    return {"length": len(path) - 1, "path": path_details}, 200


@app.route("/shortest_path", methods=["POST"])
def shortest_path():
    """Find shortest path between two nodes."""
    return _dispatch(_shortest_path, ("source_id", "target_id"))


@app.route("/reload", methods=["POST"])
def reload_graph():
    """Force reload the graph from Snowflake."""
    global _graph, _node_attrs
    _graph = None
    _node_attrs = {}
    G = load_graph()
    return jsonify({"status": "reloaded", "nodes": G.number_of_nodes(), "edges": G.number_of_edges()})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8080)
'''
def generate_spcs_setup_sql(database: str, schema: str) -> str:
    """Generate SPCS setup SQL for graph analytics.

    The script creates a compute pool, an image repository, the
    GRAPH_ANALYTICS_SERVICE service and three service functions
    (GRAPH_CENTRALITY, GRAPH_COMMUNITY_DETECTION, GRAPH_SHORTEST_PATH), and
    ends with commented-out GRANT and test statements.

    Two manual placeholders remain in the output: ``<repo_url>`` /
    ``{repo_url}`` for the image repository URL and ``<role>`` in the GRANT
    examples.

    Args:
        database: Database that holds the repository, service and functions.
        schema: Schema that holds them; also prefixes the compute pool name.

    Returns:
        The SQL script as a single string.
    """
    # NOTE(review): the service properties (MIN_INSTANCES / MAX_INSTANCES) are
    # emitted after the specification block, following the clause order of the
    # CREATE SERVICE reference - confirm against the target Snowflake release.
    return f"""-- =============================================================================
-- SPCS Graph Analytics Setup
-- Generated by ontology-stack-builder
-- =============================================================================

-- 1. Create compute pool (adjust size as needed)
CREATE COMPUTE POOL IF NOT EXISTS {schema}_GRAPH_POOL
  MIN_NODES = 1
  MAX_NODES = 1
  INSTANCE_FAMILY = CPU_X64_XS
  AUTO_RESUME = TRUE
  AUTO_SUSPEND_SECS = 300;

-- 2. Create image repository
CREATE IMAGE REPOSITORY IF NOT EXISTS {database}.{schema}.GRAPH_IMAGES;

-- 3. Build and push the Docker image:
--    docker build -t graph-analytics .
--    docker tag graph-analytics <repo_url>/graph-analytics:latest
--    docker push <repo_url>/graph-analytics:latest
--
--    Get repo URL with:
--    SHOW IMAGE REPOSITORIES IN SCHEMA {database}.{schema};

-- 4. Create the service
--    Replace {{repo_url}} below with the repository path before running.
--    SNOWFLAKE_ACCOUNT and SNOWFLAKE_HOST are provided to the container by
--    Snowflake, so the specification does not set them.
CREATE SERVICE IF NOT EXISTS {database}.{schema}.GRAPH_ANALYTICS_SERVICE
  IN COMPUTE POOL {schema}_GRAPH_POOL
  FROM SPECIFICATION $$
    spec:
      containers:
      - name: graph-analytics
        image: /{{repo_url}}/graph-analytics:latest
        resources:
          requests:
            cpu: 1
            memory: 2Gi
          limits:
            cpu: 2
            memory: 4Gi
        readinessProbe:
          path: /health
          port: 8080
      endpoints:
      - name: graph-api
        port: 8080
        public: false
  $$
  MIN_INSTANCES = 1
  MAX_INSTANCES = 1;

-- 5. Create service functions
CREATE OR REPLACE FUNCTION {database}.{schema}.GRAPH_CENTRALITY(
    metric VARCHAR,
    entity_type VARCHAR DEFAULT NULL,
    top_n INTEGER DEFAULT 10
)
RETURNS VARIANT
SERVICE = {database}.{schema}.GRAPH_ANALYTICS_SERVICE
ENDPOINT = 'graph-api'
AS '/centrality';

CREATE OR REPLACE FUNCTION {database}.{schema}.GRAPH_COMMUNITY_DETECTION(
    resolution FLOAT DEFAULT 1.0
)
RETURNS VARIANT
SERVICE = {database}.{schema}.GRAPH_ANALYTICS_SERVICE
ENDPOINT = 'graph-api'
AS '/community';

CREATE OR REPLACE FUNCTION {database}.{schema}.GRAPH_SHORTEST_PATH(
    source_id VARCHAR,
    target_id VARCHAR
)
RETURNS VARIANT
SERVICE = {database}.{schema}.GRAPH_ANALYTICS_SERVICE
ENDPOINT = 'graph-api'
AS '/shortest_path';

-- 6. Grant usage
-- GRANT USAGE ON FUNCTION {database}.{schema}.GRAPH_CENTRALITY(VARCHAR, VARCHAR, INTEGER) TO ROLE <role>;
-- GRANT USAGE ON FUNCTION {database}.{schema}.GRAPH_COMMUNITY_DETECTION(FLOAT) TO ROLE <role>;
-- GRANT USAGE ON FUNCTION {database}.{schema}.GRAPH_SHORTEST_PATH(VARCHAR, VARCHAR) TO ROLE <role>;

-- 7. Test
-- SELECT {database}.{schema}.GRAPH_CENTRALITY('pagerank', NULL, 5);
-- SELECT {database}.{schema}.GRAPH_COMMUNITY_DETECTION(1.0);
-- SELECT {database}.{schema}.GRAPH_SHORTEST_PATH('node_1', 'node_2');
"""
def main():
    """CLI entry point: render both SPCS scaffolding files into ``--output-dir``.

    Writes ``spcs_graph_service.py`` and then ``spcs_setup.sql``, creating the
    output directory if needed and printing the path of each file written.
    """
    cli = argparse.ArgumentParser(description="Generate SPCS Graph Analytics Scaffolding")
    for flag in ("--database", "--schema", "--output-dir"):
        cli.add_argument(flag, required=True)
    opts = cli.parse_args()

    target_dir = Path(opts.output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    print(f"Generating SPCS graph analytics scaffolding for {opts.database}.{opts.schema}")

    # (file name, generator) pairs, in the order they are written.
    artifacts = (
        ("spcs_graph_service.py", generate_spcs_graph_service),
        ("spcs_setup.sql", generate_spcs_setup_sql),
    )
    for file_name, render in artifacts:
        destination = target_dir / file_name
        destination.write_text(render(opts.database, opts.schema))
        print(f" Wrote {destination}")

    print("\nSPCS scaffolding generation complete")
+ +Usage: + uv run --project python /scripts/introspect_schema.py \ + --metadata-json /tmp/table_metadata.json \ + --questions "Who are the top customers?|What products sell most?" \ + --output-dir /tmp/ontology_parsed + +The --metadata-json file is produced by the SKILL.md workflow which calls +DESCRIBE TABLE and SHOW KEYS via snowflake_sql_execute, then assembles the results. + +Expected metadata JSON format: +{ + "database": "MYDB", + "schema": "MYSCHEMA", + "tables": [ + { + "name": "CUSTOMERS", + "columns": [ + {"name": "CUSTOMER_ID", "type": "NUMBER", "nullable": false, "primary_key": true}, + {"name": "NAME", "type": "VARCHAR", "nullable": false}, + {"name": "EMAIL", "type": "VARCHAR", "nullable": true}, + ... + ], + "primary_keys": ["CUSTOMER_ID"], + "foreign_keys": [ + {"column": "REGION_ID", "ref_table": "REGIONS", "ref_column": "REGION_ID"} + ], + "row_count": 50000 + }, + ... + ] +} +""" + +import argparse +import json +import re +import sys +from collections import defaultdict +from pathlib import Path + + +# Heuristic patterns for identifying column roles +ID_PATTERNS = re.compile(r"^(.*_)?(ID|KEY|CODE|UUID|PK)$", re.IGNORECASE) +NAME_PATTERNS = re.compile(r"^(.*_)?(NAME|LABEL|TITLE|DISPLAY_NAME|FULL_NAME)$", re.IGNORECASE) +DESC_PATTERNS = re.compile(r"^(.*_)?(DESC|DESCRIPTION|NOTES|COMMENT|BIO)$", re.IGNORECASE) +TYPE_PATTERNS = re.compile(r"^(.*_)?(TYPE|CATEGORY|CLASS|STATUS|KIND|GROUP)$", re.IGNORECASE) +DATE_PATTERNS = re.compile(r"^(.*_)?(DATE|TIME|TIMESTAMP|CREATED|UPDATED|MODIFIED|_AT|_ON)$", re.IGNORECASE) +FK_SUFFIX = re.compile(r"^(.+)_(ID|KEY|CODE)$", re.IGNORECASE) + + +def classify_table(table: dict) -> str: + """Classify a table as 'entity', 'relationship', or 'lookup' based on structure.""" + cols = table["columns"] + col_names = [c["name"].upper() for c in cols] + pks = [pk.upper() for pk in table.get("primary_keys", [])] + fks = table.get("foreign_keys", []) + + # Junction/bridge table: composite PK with 2+ FKs, few non-FK columns + 
if len(pks) >= 2 and len(fks) >= 2: + non_fk_cols = [c for c in col_names if not any( + c == fk["column"].upper() for fk in fks + ) and c not in pks] + if len(non_fk_cols) <= 3: + return "relationship" + + # Lookup table: small column count, single PK, has name/desc column + if len(cols) <= 5 and len(pks) == 1: + has_name = any(NAME_PATTERNS.match(c) for c in col_names) + if has_name: + return "lookup" + + return "entity" + + +def find_column(columns: list[dict], pattern: re.Pattern) -> str | None: + """Find the first column matching a pattern.""" + for col in columns: + if pattern.match(col["name"]): + return col["name"] + return None + + +def infer_class_name(table_name: str) -> str: + """Convert a table name to an ontology class name. + + CUSTOMERS -> Customer, ORDER_ITEMS -> OrderItem, etc. + """ + # Remove common prefixes + name = table_name + for prefix in ("TBL_", "T_", "DIM_", "FACT_", "STG_", "RAW_"): + if name.upper().startswith(prefix): + name = name[len(prefix):] + break + + # Convert to PascalCase singular + parts = name.lower().split("_") + # Simple singularization: remove trailing 's' if word is >3 chars + singular_parts = [] + for p in parts: + if len(p) > 3 and p.endswith("s") and not p.endswith("ss"): + singular_parts.append(p[:-1]) + else: + singular_parts.append(p) + return "".join(w.capitalize() for w in singular_parts) + + +def propose_classes(tables: list[dict], database: str, schema: str) -> list[dict]: + """Propose ontology classes from table metadata.""" + classes = [] + + for table in tables: + tbl_type = classify_table(table) + if tbl_type == "relationship": + continue # Handled in propose_relations + + class_name = infer_class_name(table["name"]) + cols = table["columns"] + pks = table.get("primary_keys", []) + + # Find key columns + id_col = pks[0] if pks else find_column(cols, ID_PATTERNS) or cols[0]["name"] + name_col = find_column(cols, NAME_PATTERNS) + desc_col = find_column(cols, DESC_PATTERNS) + type_col = find_column(cols, 
def propose_relations(tables: list[dict], classes: list[dict]) -> list[dict]:
    """Propose ontology relations from FK relationships and junction tables.

    Two kinds of relation are produced, FK-based ones first:

    1. For every foreign key of a non-junction table whose referenced table
       has a proposed class: an N:1 functional relation named
       ``has_<fk column without its _ID/_KEY/_CODE suffix>``, or
       ``belongs_to_<class>`` when the column has no such suffix.
    2. For every junction table: an N:N relation between the classes
       referenced by its first two foreign keys.

    Referenced tables are matched by exact name first (simple name, then the
    name as given) and then case-insensitively, ignoring double quotes.

    Args:
        tables: Table metadata entries.
        classes: Class records from ``propose_classes``; each needs "name" and
            "_source_table".

    Returns:
        Relation dicts; keys starting with "_" carry mapping details.
    """
    class_by_table = {c["_source_table"]: c["name"] for c in classes}
    # Case-insensitive fallback index; the first class wins on a collision.
    class_by_folded = {}
    for tbl_name, cls_name in class_by_table.items():
        class_by_folded.setdefault(tbl_name.upper(), cls_name)

    def resolve_class(ref_table):
        """Map a (possibly qualified) referenced table name to its class name."""
        simple = ref_table.split(".")[-1]
        for candidate in (simple, ref_table):
            if candidate in class_by_table:
                return class_by_table[candidate]
        return class_by_folded.get(simple.strip('"').upper())

    def build_relation(name, description, src_class, dst_class, functional, cardinality,
                       source_table, src_column, dst_column):
        """Assemble one relation record with the fixed, shared defaults."""
        return {
            "name": name,
            "label": name.replace("_", " ").title(),
            "description": description,
            "domain_class": src_class,
            "domain_classes": [src_class],
            "range_class": dst_class,
            "range_classes": [dst_class],
            "is_transitive": False,
            "is_symmetric": False,
            "is_functional": functional,
            "is_abstract": False,
            "is_hierarchical": False,
            "parent_name": None,
            "inverse_name": None,
            "cardinality": cardinality,
            "uri": f"urn:rel:{name}",
            "_source_table": source_table,
            "_src_column": src_column,
            "_dst_column": dst_column,
        }

    # Classify each table once; both passes below reuse the result.
    classified = [(table, classify_table(table)) for table in tables]
    relations = []

    # 1. FK-based relations from entity tables
    for table, tbl_type in classified:
        if tbl_type == "relationship":
            continue

        src_class = class_by_table.get(table["name"])
        if not src_class:
            continue

        for fk in table.get("foreign_keys", []):
            ref_table = fk["ref_table"]
            dst_class = resolve_class(ref_table)
            if not dst_class:
                continue

            # Derive relation name from FK column
            fk_col = fk["column"]
            match = FK_SUFFIX.match(fk_col)
            if match:
                rel_name = f"has_{match.group(1)}".lower().replace("__", "_")
            else:
                rel_name = f"belongs_to_{dst_class}".lower()

            relations.append(build_relation(
                rel_name,
                f"FK relationship from {table['name']}.{fk_col} to {ref_table}.{fk['ref_column']}",
                src_class,
                dst_class,
                True,
                "N:1",
                table["name"],
                fk["column"],
                fk["ref_column"],
            ))

    # 2. Junction table relations
    for table, tbl_type in classified:
        if tbl_type != "relationship":
            continue

        fks = table.get("foreign_keys", [])
        if len(fks) < 2:
            continue

        # Create a relation between the first two FK targets
        cls1 = resolve_class(fks[0]["ref_table"])
        cls2 = resolve_class(fks[1]["ref_table"])
        if not cls1 or not cls2:
            continue

        rel_name = infer_class_name(table["name"]).lower()

        relations.append(build_relation(
            rel_name,
            f"Many-to-many relationship via junction table {table['name']}",
            cls1,
            cls2,
            False,
            "N:N",
            table["name"],
            fks[0]["column"],
            fks[1]["column"],
        ))

    return relations
def propose_mappings(classes: list[dict], database: str, schema: str) -> list[dict]:
    """Build class-to-table mapping entries.

    Classes whose "_table_type" is "relationship" are left out; every other
    class yields one entry pointing at its fully qualified source table.
    """
    return [
        {
            "class_name": entry["name"],
            "source_table": f"{database}.{schema}.{entry['_source_table']}",
            "filter_condition": None,
            "id_column": entry["_id_column"],
            "name_column": entry.get("_name_column"),
            "description_column": entry.get("_desc_column"),
            "type_column": entry.get("_type_column"),
            "props_column": None,
            "columns": entry.get("_columns", []),
        }
        for entry in classes
        if entry.get("_table_type") != "relationship"
    ]


def propose_rel_mappings(relations: list[dict], tables: list[dict], database: str, schema: str) -> list[dict]:
    """Build relation-to-table mapping entries.

    Each entry lists, under "columns", the columns of the relation's source
    table that are neither primary-key nor foreign-key columns nor the
    relation's own source/destination column (compared case-insensitively).
    A relation whose source table is not in *tables* gets an empty list.
    """
    metadata_by_table = {}
    for tbl in tables:
        metadata_by_table[tbl["name"]] = tbl

    result = []
    for relation in relations:
        table_name = relation["_source_table"]
        meta = metadata_by_table.get(table_name, {})
        pk_names = {pk.upper() for pk in meta.get("primary_keys", [])}
        fk_names = {fk["column"].upper() for fk in meta.get("foreign_keys", [])}

        def is_payload(column_name, relation=relation, pk_names=pk_names, fk_names=fk_names):
            """True for a column that is not a key or endpoint of the relation."""
            folded = column_name.upper()
            if folded in pk_names:
                return False
            if folded in fk_names:
                return False
            if folded == relation["_src_column"].upper():
                return False
            return folded != relation["_dst_column"].upper()

        # Extra attributes carried by the source table (e.g. WEIGHT, EFFECTIVE_START)
        payload_columns = []
        for column in meta.get("columns", []):
            if is_payload(column["name"]):
                payload_columns.append({
                    "name": column["name"],
                    "data_type": column.get("type", "VARCHAR"),
                    "nullable": column.get("nullable", True),
                })

        result.append({
            "rel_name": relation["name"],
            "source_table": f"{database}.{schema}.{table_name}",
            "filter_condition": None,
            "src_column": relation["_src_column"],
            "dst_column": relation["_dst_column"],
            "props_column": None,
            "columns": payload_columns,
        })
    return result
+ for rel in relations: + src_table_name = rel["_source_table"] + table_meta = table_by_name.get(src_table_name, {}) + # Collect non-FK/non-PK columns from relationship tables (e.g. WEIGHT, EFFECTIVE_START) + pks = set(pk.upper() for pk in table_meta.get("primary_keys", [])) + fk_cols = set(fk["column"].upper() for fk in table_meta.get("foreign_keys", [])) + extra_columns = [ + { + "name": c["name"], + "data_type": c.get("type", "VARCHAR"), + "nullable": c.get("nullable", True), + } + for c in table_meta.get("columns", []) + if c["name"].upper() not in pks + and c["name"].upper() not in fk_cols + and c["name"].upper() != rel["_src_column"].upper() + and c["name"].upper() != rel["_dst_column"].upper() + ] + rel_mappings.append({ + "rel_name": rel["name"], + "source_table": f"{database}.{schema}.{src_table_name}", + "filter_condition": None, + "src_column": rel["_src_column"], + "dst_column": rel["_dst_column"], + "props_column": None, + "columns": extra_columns, + }) + return rel_mappings + + +def compute_stats(classes: list, relations: list) -> dict: + """Compute summary statistics about the proposed ontology.""" + children_map = defaultdict(list) + roots = [] + for cls in classes: + if cls.get("parent_name"): + children_map[cls["parent_name"]].append(cls["name"]) + else: + roots.append(cls["name"]) + + return { + "total_classes": len(classes), + "abstract_classes": sum(1 for c in classes if c.get("is_abstract")), + "concrete_classes": sum(1 for c in classes if not c.get("is_abstract")), + "deprecated_classes": 0, + "root_classes": len(roots), + "max_hierarchy_depth": 0, + "total_relations": len(relations), + "hierarchical_relations": sum(1 for r in relations if r.get("is_hierarchical")), + "transitive_relations": sum(1 for r in relations if r.get("is_transitive")), + "total_individuals": 0, + "top_namespaces": {}, + } + + +def clean_for_output(items: list[dict]) -> list[dict]: + """Remove internal _ prefixed keys from output.""" + cleaned = [] + for item in items: 
def main():
    """CLI entry point: propose an ontology from pre-fetched table metadata.

    Reads the metadata JSON, proposes classes, relations and mappings, and
    writes ``classes.json``, ``relations.json``, ``stats.json``,
    ``individuals.json`` and ``mappings.json`` into ``--output-dir``, then
    prints a summary of the proposal.

    Exits through ``parser.error`` (status 2) when the metadata document is
    not an object or lacks "database", "schema" or "tables".
    """
    parser = argparse.ArgumentParser(description="Introspect Snowflake schema and propose ontology")
    parser.add_argument("--metadata-json", required=True, help="Path to table metadata JSON")
    parser.add_argument("--questions", default="", help="Pipe-separated business questions")
    parser.add_argument("--output-dir", required=True, help="Directory for output JSON files")
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load metadata; decode as UTF-8 explicitly rather than with the locale encoding
    with open(args.metadata_json, encoding="utf-8") as f:
        metadata = json.load(f)

    if not isinstance(metadata, dict):
        parser.error(f"{args.metadata_json} must contain a JSON object")
    missing = [key for key in ("database", "schema", "tables") if key not in metadata]
    if missing:
        parser.error(f"{args.metadata_json} is missing required key(s): {', '.join(missing)}")

    database = metadata["database"]
    schema = metadata["schema"]
    tables = metadata["tables"]

    print(f"Introspecting {len(tables)} tables in {database}.{schema}")

    # Propose ontology
    classes = propose_classes(tables, database, schema)
    relations = propose_relations(tables, classes)
    class_mappings = propose_mappings(classes, database, schema)
    rel_mappings = propose_rel_mappings(relations, tables, database, schema)
    stats = compute_stats(classes, relations)

    # Questions (stored for downstream use in semantic model verified queries)
    questions = [q.strip() for q in args.questions.split("|") if q.strip()]

    print(f" Proposed {len(classes)} classes, {len(relations)} relations")
    print(f" {len(class_mappings)} class mappings, {len(rel_mappings)} relation mappings")

    outputs = {
        # Same format as parse_owl.py for downstream compatibility
        "classes": clean_for_output(classes),
        "relations": clean_for_output(relations),
        "stats": stats,
        "individuals": [],  # No individuals in schema-first path
        # Additional output not in parse_owl.py
        "mappings": {
            "database": database,
            "schema": schema,
            "class_mappings": class_mappings,
            "relation_mappings": rel_mappings,
            "questions": questions,
        },
    }
    for name, data in outputs.items():
        out_path = output_dir / f"{name}.json"
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        print(f" Wrote {out_path}")

    # Print summary
    print("\n=== Proposed Ontology ===")
    print(f" Classes: {len(classes)}")
    for cls in classes:
        ttype = "abstract" if cls.get("is_abstract") else "concrete"
        print(f" - {cls['name']} ({ttype}) <- {cls['_source_table']}")
    print(f" Relations: {len(relations)}")
    for rel in relations:
        print(f" - {rel['domain_class']} --[{rel['name']}]--> {rel['range_class']}")
"schema": schema, + "class_mappings": class_mappings, + "relation_mappings": rel_mappings, + "questions": questions, + } + mappings_path = output_dir / "mappings.json" + with open(mappings_path, "w") as f: + json.dump(mappings_data, f, indent=2) + print(f" Wrote {mappings_path}") + + # Print summary + print(f"\n=== Proposed Ontology ===") + print(f" Classes: {len(classes)}") + for cls in classes: + ttype = "abstract" if cls.get("is_abstract") else "concrete" + print(f" - {cls['name']} ({ttype}) <- {cls['_source_table']}") + print(f" Relations: {len(relations)}") + for rel in relations: + print(f" - {rel['domain_class']} --[{rel['name']}]--> {rel['range_class']}") + + +if __name__ == "__main__": + main() diff --git a/skills/ontology-stack-builder/scripts/parse_owl.py b/skills/ontology-stack-builder/scripts/parse_owl.py new file mode 100644 index 00000000..f45ef3f8 --- /dev/null +++ b/skills/ontology-stack-builder/scripts/parse_owl.py @@ -0,0 +1,339 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "rdflib>=7.0.0", +# ] +# /// +""" +OWL Ontology Parser - Extracts classes, relationships, and individuals from OWL/RDF files. + +Outputs structured JSON files that the ontology-semantic-modeler skill uses to generate +Snowflake metadata tables and semantic models. 
def parse_uri_to_name(uri: str) -> str:
    """Extract a human-readable name from a URI.

    Returns the fragment after the last "#" when there is one, otherwise the
    last "/"-separated segment, otherwise the whole string. Trailing "/" and
    "#" characters are ignored, so "http://x/onto/Person/" gives "Person".
    A falsy *uri* gives "".
    """
    if not uri:
        return ""
    text = str(uri)
    # Drop trailing separators; fall back to the raw text if nothing is left.
    trimmed = text.rstrip("/#") or text
    for separator in ("#", "/"):
        if separator in trimmed:
            return trimmed.rsplit(separator, 1)[-1]
    return trimmed


def _pick_literal(values) -> str | None:
    """Choose one value deterministically, or return None when there is none.

    Candidates are ordered by their string form. The first one with no
    language tag or an English ("en...") tag wins; otherwise the first one.
    """
    candidates = sorted(values, key=str)
    if not candidates:
        return None
    for candidate in candidates:
        language = getattr(candidate, "language", None)
        if not language or language.lower().startswith("en"):
            return str(candidate)
    return str(candidates[0])


def get_label(graph: Graph, uri: URIRef) -> str:
    """Get rdfs:label or skos:prefLabel for a URI, falling back to URI parsing.

    When a predicate has several values the choice is deterministic (see
    ``_pick_literal``) rather than depending on triple iteration order.
    """
    for pred in (RDFS.label, SKOS.prefLabel):
        chosen = _pick_literal(graph.objects(uri, pred))
        if chosen is not None:
            return chosen
    return parse_uri_to_name(uri)


def get_description(graph: Graph, uri: URIRef) -> str:
    """Get description from common annotation properties.

    Tries rdfs:comment, dc:description, dcterms:description and
    skos:definition in that order; returns "" when none is present.
    """
    for pred in (RDFS.comment, DC.description, DCTERMS.description, SKOS.definition):
        chosen = _pick_literal(graph.objects(uri, pred))
        if chosen is not None:
            return chosen
    return ""


def extract_classes(graph: Graph) -> list[dict]:
    """Extract all OWL classes with hierarchy information.

    A class is any non-blank node typed owl:Class or rdfs:Class, or appearing
    on either side of rdfs:subClassOf. Records are returned sorted by URI.
    "parent_names" is sorted, so "parent_name" (its first element) is stable
    between runs.
    """
    # Find all OWL classes
    class_uris = set()
    for class_type in (OWL.Class, RDFS.Class):
        class_uris.update(s for s in graph.subjects(RDF.type, class_type) if not isinstance(s, BNode))

    # Also find classes referenced in subClassOf
    for s, _, o in graph.triples((None, RDFS.subClassOf, None)):
        class_uris.update(node for node in (s, o) if not isinstance(node, BNode))

    classes = []
    seen = set()
    for cls_uri in sorted(class_uris, key=str):
        uri_str = str(cls_uri)
        # Guard against distinct nodes that share one string form.
        if uri_str in seen:
            continue
        seen.add(uri_str)

        # Find parent classes (direct subClassOf, skip blank nodes like restrictions)
        parents = sorted(
            parse_uri_to_name(parent)
            for parent in graph.objects(cls_uri, RDFS.subClassOf)
            if not isinstance(parent, BNode)
        )

        # Check if class is deprecated
        deprecated = any(
            str(obj).lower() in ("true", "1")
            for obj in graph.objects(cls_uri, OWL.deprecated)
        )

        # Determine if abstract (has subclasses but no direct instances defined)
        has_subclasses = any(True for _ in graph.subjects(RDFS.subClassOf, cls_uri))
        has_instances = any(True for _ in graph.subjects(RDF.type, cls_uri))

        classes.append({
            "uri": uri_str,
            "name": parse_uri_to_name(cls_uri),
            "label": get_label(graph, cls_uri),
            "description": get_description(graph, cls_uri),
            "parent_names": parents,
            "parent_name": parents[0] if parents else None,
            "is_abstract": has_subclasses and not has_instances,
            "is_deprecated": deprecated,
            "namespace": uri_str.rsplit("#", 1)[0] if "#" in uri_str else uri_str.rsplit("/", 1)[0],
        })

    return classes
cls_uri) + description = get_description(graph, cls_uri) + + # Find parent classes (direct subClassOf, skip blank nodes like restrictions) + parents = [] + for parent in graph.objects(cls_uri, RDFS.subClassOf): + if not isinstance(parent, BNode): + parents.append(parse_uri_to_name(parent)) + + # Check if class is deprecated + deprecated = False + for obj in graph.objects(cls_uri, OWL.deprecated): + if str(obj).lower() in ("true", "1"): + deprecated = True + + # Determine if abstract (has subclasses but no direct instances defined) + has_subclasses = any(True for _ in graph.subjects(RDFS.subClassOf, cls_uri)) + has_instances = any(True for _ in graph.subjects(RDF.type, cls_uri)) + is_abstract = has_subclasses and not has_instances + + classes.append({ + "uri": uri_str, + "name": name, + "label": label, + "description": description, + "parent_names": parents, + "parent_name": parents[0] if parents else None, + "is_abstract": is_abstract, + "is_deprecated": deprecated, + "namespace": uri_str.rsplit("#", 1)[0] if "#" in uri_str else uri_str.rsplit("/", 1)[0], + }) + + return classes + + +def extract_relations(graph: Graph) -> list[dict]: + """Extract OWL object properties as relationship definitions.""" + relations = [] + seen = set() + + prop_uris = set() + for s in graph.subjects(RDF.type, OWL.ObjectProperty): + if not isinstance(s, BNode): + prop_uris.add(s) + for s in graph.subjects(RDF.type, RDF.Property): + if not isinstance(s, BNode): + prop_uris.add(s) + + for prop_uri in sorted(prop_uris, key=str): + uri_str = str(prop_uri) + if uri_str in seen: + continue + seen.add(uri_str) + + name = parse_uri_to_name(prop_uri) + label = get_label(graph, prop_uri) + description = get_description(graph, prop_uri) + + # Domain and range + domains = [parse_uri_to_name(d) for d in graph.objects(prop_uri, RDFS.domain) if not isinstance(d, BNode)] + ranges = [parse_uri_to_name(r) for r in graph.objects(prop_uri, RDFS.range) if not isinstance(r, BNode)] + + # Property 
characteristics + is_transitive = (prop_uri, RDF.type, OWL.TransitiveProperty) in graph + is_symmetric = (prop_uri, RDF.type, OWL.SymmetricProperty) in graph + is_functional = (prop_uri, RDF.type, OWL.FunctionalProperty) in graph + + # Inverse + inverses = [parse_uri_to_name(inv) for inv in graph.objects(prop_uri, OWL.inverseOf) if not isinstance(inv, BNode)] + + # subClassOf is always hierarchical + is_hierarchical = name.lower() in ("subclassof", "part_of", "has_part", "is_a") + + # Cardinality heuristic + cardinality = "N:N" + if is_functional: + cardinality = "N:1" + + relations.append({ + "uri": uri_str, + "name": name, + "label": label, + "description": description, + "domain_classes": domains, + "domain_class": domains[0] if domains else "Thing", + "range_classes": ranges, + "range_class": ranges[0] if ranges else "Thing", + "is_transitive": is_transitive, + "is_symmetric": is_symmetric, + "is_functional": is_functional, + "is_hierarchical": is_hierarchical, + "inverse_name": inverses[0] if inverses else None, + "cardinality": cardinality, + }) + + # Always ensure subClassOf is present (it's implicit in OWL) + if not any(r["name"] == "subClassOf" for r in relations): + relations.insert(0, { + "uri": str(RDFS.subClassOf), + "name": "subClassOf", + "label": "subClassOf", + "description": "Taxonomic subsumption (is-a hierarchy)", + "domain_classes": ["Thing"], + "domain_class": "Thing", + "range_classes": ["Thing"], + "range_class": "Thing", + "is_transitive": True, + "is_symmetric": False, + "is_functional": False, + "is_hierarchical": True, + "inverse_name": "hasSubClass", + "cardinality": "N:1", + }) + + return relations + + +def extract_individuals(graph: Graph) -> list[dict]: + """Extract named individuals (OWL instances).""" + individuals = [] + seen = set() + + for s in graph.subjects(RDF.type, OWL.NamedIndividual): + if isinstance(s, BNode): + continue + uri_str = str(s) + if uri_str in seen: + continue + seen.add(uri_str) + + name = 
def compute_stats(classes: list, relations: list, individuals: list) -> dict:
    """Compute summary statistics about the parsed ontology.

    Args:
        classes: Class dicts. Each must provide the keys ``name``,
            ``parent_name``, ``is_abstract``, ``is_deprecated`` and
            ``namespace``.
        relations: Relation dicts. Each must provide ``is_hierarchical`` and
            ``is_transitive``.
        individuals: Individual dicts; only their count is used.

    Returns:
        A dict with the keys ``total_classes``, ``abstract_classes``,
        ``concrete_classes``, ``deprecated_classes``, ``root_classes``,
        ``max_hierarchy_depth``, ``total_relations``,
        ``hierarchical_relations``, ``transitive_relations``,
        ``total_individuals`` and ``top_namespaces`` (the ten namespaces with
        the most classes, most frequent first).

    Raises:
        KeyError: If a class or relation dict lacks one of the required keys.
    """
    known_names = {cls["name"] for cls in classes}

    # A class counts as a root when it has no parent OR when its parent is not
    # part of ``classes`` (e.g. the parent was removed by a filter). Without
    # the second case such classes would be unreachable from any root and
    # silently excluded from the depth computation.
    children_map = defaultdict(list)
    roots = []
    for cls in classes:
        parent = cls["parent_name"]
        if parent and parent in known_names:
            children_map[parent].append(cls["name"])
        else:
            roots.append(cls["name"])

    # Level-order traversal. The visited set guarantees termination even when
    # colliding local names make the name-based parent links cyclic.
    max_depth = 0
    visited = set()
    depth = 0
    frontier = roots
    while frontier:
        next_frontier = []
        for node in frontier:
            if node in visited:
                continue
            visited.add(node)
            max_depth = depth
            next_frontier.extend(children_map.get(node, ()))
        frontier = next_frontier
        depth += 1

    # Namespace breakdown
    namespaces = defaultdict(int)
    for cls in classes:
        namespaces[cls["namespace"]] += 1

    return {
        "total_classes": len(classes),
        "abstract_classes": sum(1 for c in classes if c["is_abstract"]),
        "concrete_classes": sum(1 for c in classes if not c["is_abstract"]),
        "deprecated_classes": sum(1 for c in classes if c["is_deprecated"]),
        "root_classes": len(roots),
        "max_hierarchy_depth": max_depth,
        "total_relations": len(relations),
        "hierarchical_relations": sum(1 for r in relations if r["is_hierarchical"]),
        "transitive_relations": sum(1 for r in relations if r["is_transitive"]),
        "total_individuals": len(individuals),
        "top_namespaces": dict(sorted(namespaces.items(), key=lambda x: -x[1])[:10]),
    }
parser.add_argument("--format", default=None, help="RDF format hint (xml, turtle, n3, nt). Auto-detected if omitted.") + parser.add_argument("--exclude-deprecated", action="store_true", help="Exclude deprecated classes") + parser.add_argument("--namespace-filter", default=None, help="Only include classes from this namespace prefix") + args = parser.parse_args() + + owl_path = Path(args.owl_file) + if not owl_path.exists(): + print(f"ERROR: OWL file not found: {owl_path}", file=sys.stderr) + sys.exit(1) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Parse the ontology + print(f"Parsing ontology: {owl_path}") + g = Graph() + + fmt = args.format + if fmt is None: + suffix = owl_path.suffix.lower() + fmt_map = {".owl": "xml", ".rdf": "xml", ".ttl": "turtle", ".n3": "n3", ".nt": "nt"} + fmt = fmt_map.get(suffix, "xml") + + g.parse(str(owl_path), format=fmt) + print(f" Loaded {len(g)} triples") + + # Extract components + classes = extract_classes(g) + relations = extract_relations(g) + individuals = extract_individuals(g) + + # Apply filters + if args.exclude_deprecated: + classes = [c for c in classes if not c["is_deprecated"]] + + if args.namespace_filter: + ns = args.namespace_filter + classes = [c for c in classes if c["namespace"].startswith(ns)] + + # Compute stats + stats = compute_stats(classes, relations, individuals) + + # Write outputs + for name, data in [("classes", classes), ("relations", relations), ("individuals", individuals), ("stats", stats)]: + out_path = output_dir / f"{name}.json" + with open(out_path, "w") as f: + json.dump(data, f, indent=2) + print(f" Wrote {out_path} ({len(data) if isinstance(data, list) else 'summary'} items)") + + # Print summary + print(f"\n=== Ontology Summary ===") + print(f" Classes: {stats['total_classes']} ({stats['abstract_classes']} abstract, {stats['concrete_classes']} concrete)") + print(f" Relations: {stats['total_relations']} ({stats['hierarchical_relations']} 
hierarchical)") + print(f" Individuals: {stats['total_individuals']}") + print(f" Max depth: {stats['max_hierarchy_depth']}") + print(f" Root classes: {stats['root_classes']}") + if stats['deprecated_classes']: + print(f" Deprecated: {stats['deprecated_classes']}") + + +if __name__ == "__main__": + main() diff --git a/skills/ontology-stack-builder/scripts/visualize_ontology.py b/skills/ontology-stack-builder/scripts/visualize_ontology.py new file mode 100644 index 00000000..80944e0e --- /dev/null +++ b/skills/ontology-stack-builder/scripts/visualize_ontology.py @@ -0,0 +1,1433 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "streamlit>=1.30.0", +# "streamlit-agraph>=0.0.45", +# "pyyaml>=6.0", +# ] +# /// +""" +Ontology Visualization App - Interactive exploration of parsed OWL ontology +and generated semantic model. + +Usage: + uv run --script visualize_ontology.py -- \ + --classes-json /tmp/parsed/classes.json \ + --relations-json /tmp/parsed/relations.json \ + --semantic-model /tmp/generated/03_ontology_semantic_model.yaml +""" + +import argparse +import copy +import json +import sys +from collections import defaultdict +from pathlib import Path + +import streamlit as st +import yaml +from streamlit_agraph import agraph, Node, Edge, Config + + +# --------------------------------------------------------------------------- +# Session-state helpers for mutable ontology editing +# --------------------------------------------------------------------------- + +def _init_session_state(classes: list[dict], relations: list[dict], + classes_path: str, relations_path: str) -> None: + """Deep-copy loaded data into st.session_state on first run.""" + if "classes" not in st.session_state: + st.session_state["classes"] = copy.deepcopy(classes) + if "relations" not in st.session_state: + st.session_state["relations"] = copy.deepcopy(relations) + if "classes_path" not in st.session_state: + st.session_state["classes_path"] = classes_path + if "relations_path" 
not in st.session_state: + st.session_state["relations_path"] = relations_path + # Track which names were added or modified this session + if "added_classes" not in st.session_state: + st.session_state["added_classes"] = set() + if "added_relations" not in st.session_state: + st.session_state["added_relations"] = set() + if "modified_classes" not in st.session_state: + st.session_state["modified_classes"] = set() + if "modified_relations" not in st.session_state: + st.session_state["modified_relations"] = set() + if "has_unsaved_changes" not in st.session_state: + st.session_state["has_unsaved_changes"] = False + + +def _mark_dirty() -> None: + st.session_state["has_unsaved_changes"] = True + + +def _save_to_disk() -> None: + """Write current session-state classes/relations back to JSON files.""" + with open(st.session_state["classes_path"], "w") as f: + json.dump(st.session_state["classes"], f, indent=2) + with open(st.session_state["relations_path"], "w") as f: + json.dump(st.session_state["relations"], f, indent=2) + st.session_state["has_unsaved_changes"] = False + + +def _undo_all(original_classes: list[dict], original_relations: list[dict]) -> None: + """Reset session state to the original loaded data.""" + st.session_state["classes"] = copy.deepcopy(original_classes) + st.session_state["relations"] = copy.deepcopy(original_relations) + st.session_state["added_classes"] = set() + st.session_state["added_relations"] = set() + st.session_state["modified_classes"] = set() + st.session_state["modified_relations"] = set() + st.session_state["has_unsaved_changes"] = False + + +# --------------------------------------------------------------------------- +# Sidebar editor widgets +# --------------------------------------------------------------------------- + +def _render_editor_sidebar(original_classes: list[dict], + original_relations: list[dict]) -> None: + """Render the ontology editor section in the sidebar.""" + classes = st.session_state["classes"] + 
relations = st.session_state["relations"] + class_names = sorted(c["name"] for c in classes) + + st.sidebar.divider() + st.sidebar.header("Ontology Editor") + + # --- Unsaved-changes indicator --- + if st.session_state["has_unsaved_changes"]: + st.sidebar.warning("You have unsaved changes.") + + # --- Save / Undo --- + col_save, col_undo = st.sidebar.columns(2) + with col_save: + if st.button("Save Changes", use_container_width=True, + disabled=not st.session_state["has_unsaved_changes"]): + _save_to_disk() + st.rerun() + with col_undo: + if st.button("Undo All", use_container_width=True, + disabled=not st.session_state["has_unsaved_changes"]): + _undo_all(original_classes, original_relations) + st.rerun() + + # --- Add Class --- + with st.sidebar.expander("Add Class"): + with st.form("add_class_form", clear_on_submit=True): + new_name = st.text_input("Class name (PascalCase)") + new_label = st.text_input("Label (display name)") + new_desc = st.text_input("Description") + parent_opts = ["(none)"] + class_names + new_parent = st.selectbox("Parent class", parent_opts) + new_abstract = st.checkbox("Abstract?") + submitted = st.form_submit_button("Add Class") + if submitted and new_name.strip(): + name = new_name.strip() + if any(c["name"] == name for c in classes): + st.error(f"Class '{name}' already exists.") + else: + cls_dict = { + "name": name, + "label": new_label.strip() or name, + "description": new_desc.strip(), + "parent_name": None if new_parent == "(none)" else new_parent, + "is_abstract": new_abstract, + "is_deprecated": False, + "namespace": "", + "uri": f"urn:user:{name}", + } + st.session_state["classes"].append(cls_dict) + st.session_state["added_classes"].add(name) + _mark_dirty() + st.rerun() + + # --- Add Relation --- + with st.sidebar.expander("Add Relation"): + with st.form("add_relation_form", clear_on_submit=True): + rel_name = st.text_input("Relation name") + src_class = st.selectbox("Source (domain) class", class_names, key="rel_src") + 
dst_class = st.selectbox("Target (range) class", class_names, key="rel_dst") + card_opts = ["many_to_many", "many_to_one", "one_to_many", "one_to_one"] + cardinality = st.selectbox("Cardinality", card_opts) + rel_desc = st.text_input("Description", key="rel_desc") + rel_submit = st.form_submit_button("Add Relation") + if rel_submit and rel_name.strip(): + rn = rel_name.strip() + if any(r["name"] == rn for r in relations): + st.error(f"Relation '{rn}' already exists.") + else: + rel_dict = { + "name": rn, + "domain_class": src_class, + "range_class": dst_class, + "cardinality": cardinality, + "description": rel_desc.strip(), + "is_hierarchical": False, + "is_transitive": False, + } + st.session_state["relations"].append(rel_dict) + st.session_state["added_relations"].add(rn) + _mark_dirty() + st.rerun() + + # --- Delete Class --- + with st.sidebar.expander("Delete Class"): + del_class = st.selectbox("Select class to delete", ["(select)"] + class_names, + key="del_class_sel") + if del_class != "(select)": + children = [c["name"] for c in classes if c.get("parent_name") == del_class] + involved = [r["name"] for r in relations + if r.get("domain_class") == del_class or r.get("range_class") == del_class] + if children: + st.caption(f"Children that will be re-parented to root: {', '.join(children)}") + if involved: + st.caption(f"Relations that will be removed: {', '.join(involved)}") + if st.button("Delete Class", key="del_class_btn"): + # Re-parent children + for c in st.session_state["classes"]: + if c.get("parent_name") == del_class: + c["parent_name"] = None + st.session_state["modified_classes"].add(c["name"]) + # Remove involved relations + st.session_state["relations"] = [ + r for r in st.session_state["relations"] + if r.get("domain_class") != del_class and r.get("range_class") != del_class + ] + # Remove the class + st.session_state["classes"] = [ + c for c in st.session_state["classes"] if c["name"] != del_class + ] + 
st.session_state["added_classes"].discard(del_class) + _mark_dirty() + st.rerun() + + # --- Delete Relation --- + with st.sidebar.expander("Delete Relation"): + rel_names = sorted(r["name"] for r in relations) + del_rel = st.selectbox("Select relation to delete", ["(select)"] + rel_names, + key="del_rel_sel") + if del_rel != "(select)": + if st.button("Delete Relation", key="del_rel_btn"): + st.session_state["relations"] = [ + r for r in st.session_state["relations"] if r["name"] != del_rel + ] + st.session_state["added_relations"].discard(del_rel) + _mark_dirty() + st.rerun() + + +# --------------------------------------------------------------------------- +# Inline edit form for the detail panel (Tab 2 right column) +# --------------------------------------------------------------------------- + +def _render_edit_class_form(cls_name: str) -> None: + """Render an inline edit form for a class in the detail panel.""" + classes = st.session_state["classes"] + cls = next((c for c in classes if c["name"] == cls_name), None) + if not cls: + return + + class_names = sorted(c["name"] for c in classes if c["name"] != cls_name) + + st.markdown("---") + st.markdown("#### Edit Class") + with st.form(f"edit_class_{cls_name}", clear_on_submit=False): + new_label = st.text_input("Label", value=cls.get("label", cls_name)) + new_desc = st.text_input("Description", value=cls.get("description", "")) + parent_opts = ["(none)"] + class_names + current_parent = cls.get("parent_name") + parent_idx = parent_opts.index(current_parent) if current_parent in parent_opts else 0 + new_parent = st.selectbox("Parent", parent_opts, index=parent_idx) + new_abstract = st.checkbox("Abstract?", value=cls.get("is_abstract", False)) + + col1, col2 = st.columns(2) + with col1: + save = st.form_submit_button("Save") + with col2: + cancel = st.form_submit_button("Cancel") + + if save: + cls["label"] = new_label.strip() or cls_name + cls["description"] = new_desc.strip() + cls["parent_name"] = None if 
def build_tree(classes: list[dict]) -> tuple[dict, list, dict]:
    """Build a parent->children tree structure from class dicts.

    Args:
        classes: Class dicts. Each must have a ``name`` key; ``parent_name``
            is optional.

    Returns:
        A 3-tuple ``(children_map, roots, name_to_cls)``:

        - ``children_map``: ``defaultdict(list)`` mapping a parent name to the
          names of its direct children, in input order.
        - ``roots``: names of classes that have no parent, or whose parent is
          not present in ``classes``, in input order.
        - ``name_to_cls``: mapping from class name to its dict.
    """
    name_to_cls = {cls["name"]: cls for cls in classes}

    children_map = defaultdict(list)
    roots = []
    for cls in classes:
        parent = cls.get("parent_name")
        if parent:
            children_map[parent].append(cls["name"])
        # A class whose parent is missing from the input would never be
        # reached from any root; promote it to a root so it stays visible.
        if not parent or parent not in name_to_cls:
            roots.append(cls["name"])
    return children_map, roots, name_to_cls
def render_tree_text(name: str, children_map: dict, name_to_cls: dict, depth: int = 0, max_depth: int = 6) -> str:
    """Render the class subtree rooted at ``name`` as indented text.

    One line is produced per class, in pre-order with children sorted by
    name. Each line shows the class label (the name itself when no label is
    recorded) followed by `` [abstract]`` for abstract classes. Nodes deeper
    than ``max_depth`` are omitted.
    """
    rendered = []
    # Explicit stack instead of recursion; children are pushed in descending
    # order so they are popped (and emitted) in ascending order.
    pending = [(name, depth)]
    while pending:
        current, level = pending.pop()
        if level > max_depth:
            continue
        info = name_to_cls.get(current, {})
        indent = " " * level
        if level > 0:
            indent += "|- "
        suffix = " [abstract]" if info.get("is_abstract") else ""
        rendered.append(f"{indent}{info.get('label', current)}{suffix}\n")
        for kid in sorted(children_map.get(current, []), reverse=True):
            pending.append((kid, level + 1))
    return "".join(rendered)
{display}
', + unsafe_allow_html=True + ) + if description: + st.markdown( + f'
{description}
', + unsafe_allow_html=True + ) + for child in kids: + render_interactive_tree(child, children_map, name_to_cls, + depth + 1, max_depth, search) + else: + # Leaf node: render inline with indentation via markdown + st.markdown( + f'
{display}
', + unsafe_allow_html=True + ) + if description and search and search.lower() in label.lower(): + st.markdown( + f'
{description}
def render_rel_tree_text(name: str, rel_children_map: dict, rel_by_name: dict,
                         depth: int = 0, max_depth: int = 6) -> str:
    """Render the relation subtree rooted at ``name`` as indented text.

    Each line shows the relation label (the name itself when no label is
    recorded), an `` [abstract]`` marker for abstract relations, and the
    ``(domain -> range)`` pair, with ``?`` standing in for a missing side.
    Children follow their parent, sorted by name; nodes deeper than
    ``max_depth`` are omitted.
    """
    if depth > max_depth:
        return ""
    rel = rel_by_name.get(name, {})
    branch = "|- " if depth > 0 else ""
    tag = " [abstract]" if rel.get("is_abstract", False) else ""
    header = (
        f"{' ' * depth}{branch}{rel.get('label', name)}{tag}"
        f" ({rel.get('domain_class', '?')} -> {rel.get('range_class', '?')})\n"
    )
    subtrees = (
        render_rel_tree_text(kid, rel_children_map, rel_by_name, depth + 1, max_depth)
        for kid in sorted(rel_children_map.get(name, []))
    )
    return header + "".join(subtrees)
{display}
', + unsafe_allow_html=True + ) + if description: + st.markdown( + f'
{description}
', + unsafe_allow_html=True + ) + cardinality = r.get("cardinality", "") + if cardinality: + st.markdown( + f'
Cardinality: {cardinality}
', + unsafe_allow_html=True + ) + for child in kids: + render_interactive_rel_tree(child, rel_children_map, rel_by_name, + depth + 1, max_depth) + else: + st.markdown( + f'
{display}
', + unsafe_allow_html=True + ) + if description: + st.markdown( + f'
{description}
', + unsafe_allow_html=True + ) + + +def build_coverage_map(classes: list[dict], sem_model: dict | None, + deployed_objects: dict | None = None) -> dict[str, dict]: + """Classify each OWL class by its coverage status. + + Coverage can be determined from either a semantic model YAML (Phase 5+) + or a deployed-objects manifest (Phase 4 post-deployment). + + Returns a dict keyed by class name with values: + status: 'mapped' | 'covered' | 'unmapped' | 'abstract' + view_name: the view name (only for 'mapped') + covering_ancestor: label of the ancestor whose view covers this class (only for 'covered') + sf_objects: list of Snowflake object names for this class (when deployed_objects provided) + """ + result: dict[str, dict] = {} + + # --- Path 1: Deployed-objects manifest (Phase 4 post-deployment) --- + if deployed_objects and not sem_model: + class_to_objects = deployed_objects.get("class_to_objects", {}) + name_to_cls = {c["name"]: c for c in classes} + + def _has_view(cls_name: str) -> list[str]: + """Return list of view names for this class from the manifest.""" + entry = class_to_objects.get(cls_name, {}) + return entry.get("views", []) + + def _find_deployed_ancestor(cls: dict) -> tuple[str, list[str]] | None: + """Walk up the hierarchy to find an ancestor with deployed views.""" + visited = set() + current = cls.get("parent_name") + while current and current not in visited: + visited.add(current) + parent_cls = name_to_cls.get(current) + if parent_cls: + views = _has_view(current) + if views: + return (parent_cls.get("label", current), views) + current = parent_cls.get("parent_name") + else: + break + return None + + for cls in classes: + name = cls["name"] + views = _has_view(name) + entry = class_to_objects.get(name, {}) + sf_objs = views + ([entry["metadata_row"]] if entry.get("metadata_row") else []) + + if views: + result[name] = { + "status": "mapped", + "view_name": views[0], + "sf_objects": sf_objs, + } + elif cls.get("is_abstract"): + result[name] = 
{"status": "abstract", "sf_objects": sf_objs} + else: + ancestor = _find_deployed_ancestor(cls) + if ancestor: + result[name] = { + "status": "covered", + "covering_ancestor": ancestor[0], + "covering_view": ancestor[1][0], + "sf_objects": sf_objs, + } + else: + result[name] = {"status": "unmapped", "sf_objects": sf_objs} + + return result + + # --- Path 2: No coverage source available (Phase 3 pre-deployment) --- + if not sem_model: + for cls in classes: + result[cls["name"]] = {"status": "abstract" if cls.get("is_abstract") else "unmapped"} + return result + + # --- Path 3: Semantic model YAML (Phase 5+) --- + tables_in_model = sem_model.get("tables", []) + # Build suffix -> full view name mapping + suffix_to_view: dict[str, str] = {} + for tbl in tables_in_model: + tname = tbl["name"] + upper = tname.upper() + if "VW_ONT_" in upper: + suffix = upper.split("VW_ONT_", 1)[1] + suffix_to_view[suffix] = tname + + name_to_cls = {c["name"]: c for c in classes} + + def get_matched_view(cls: dict) -> str | None: + name_upper = cls["name"].upper() + for suffix, view_name in suffix_to_view.items(): + if suffix == name_upper or suffix in name_upper or name_upper in suffix: + return view_name + return None + + def find_covering_ancestor(cls: dict) -> tuple[str, str] | None: + """Returns (ancestor_label, ancestor_view_name) or None.""" + visited = set() + current = cls.get("parent_name") + while current and current not in visited: + visited.add(current) + parent_cls = name_to_cls.get(current) + if parent_cls: + view = get_matched_view(parent_cls) + if view: + return (parent_cls.get("label", current), view) + current = parent_cls.get("parent_name") + else: + break + return None + + for cls in classes: + view = get_matched_view(cls) + if view: + result[cls["name"]] = {"status": "mapped", "view_name": view} + elif cls.get("is_abstract"): + result[cls["name"]] = {"status": "abstract"} + else: + ancestor = find_covering_ancestor(cls) + if ancestor: + result[cls["name"]] = { + 
"status": "covered", + "covering_ancestor": ancestor[0], + "covering_view": ancestor[1], + } + else: + result[cls["name"]] = {"status": "unmapped"} + + return result + + +def build_agraph_nodes_edges(classes: list[dict], relations: list[dict], + coverage_map: dict[str, dict], + max_nodes: int = 100, + show_relations: bool = True, + added_classes: set | None = None, + modified_classes: set | None = None, + added_relations: set | None = None, + layer_filter: str = "all") -> tuple[list, list]: + """Build streamlit-agraph Node and Edge lists with coverage coloring. + + Nodes that were added this session get a gold border; modified nodes get + an orange border. Similarly, added relation edges are drawn in gold. + + layer_filter controls visibility: + - "all": show everything (abstract items rendered distinctly) + - "concrete": hide abstract classes and abstract relations + - "abstract": show only abstract classes and abstract relations + """ + added_classes = added_classes or set() + modified_classes = modified_classes or set() + added_relations = added_relations or set() + + # Filter classes based on layer + if layer_filter == "concrete": + classes = [c for c in classes if not c.get("is_abstract")] + elif layer_filter == "abstract": + classes = [c for c in classes if c.get("is_abstract")] + + # Color scheme matching coverage status + colors = { + "mapped": {"background": "#2ecc71", "border": "#27ae60", "font": "#ffffff"}, + "covered": {"background": "#3498db", "border": "#2980b9", "font": "#ffffff"}, + "unmapped": {"background": "#e74c3c", "border": "#c0392b", "font": "#ffffff"}, + "abstract": {"background": "#ecf0f1", "border": "#95a5a6", "font": "#2c3e50"}, + } + + name_to_cls = {c["name"]: c for c in classes} + nodes = [] + edges = [] + displayed = set() + + for cls in classes[:max_nodes]: + name = cls["name"] + label = cls.get("label", name) + cov = coverage_map.get(name, {"status": "unmapped"}) + status = cov["status"] + c = colors[status] + + # Override border 
color for added / modified nodes + if name in added_classes: + border_color = "#f1c40f" # gold + border_width = 4 + elif name in modified_classes: + border_color = "#e67e22" # orange + border_width = 4 + else: + border_color = c["border"] + border_width = 2 + + # Build hover tooltip + tooltip_parts = [f"{label}"] + if name in added_classes: + tooltip_parts.append("NEW") + elif name in modified_classes: + tooltip_parts.append("MODIFIED") + if cls.get("is_abstract"): + tooltip_parts.append("Abstract class") + if cls.get("description"): + tooltip_parts.append(cls["description"][:150]) + tooltip_parts.append(f"
Coverage: {status.upper()}") + if status == "mapped": + vn = cov.get("view_name", "") + short = vn + if "VW_ONT_" in vn.upper(): + short = "VW_ONT_" + vn.upper().split("VW_ONT_", 1)[1] + tooltip_parts.append(f"View: {short}") + elif status == "covered": + tooltip_parts.append(f"Covered via: {cov.get('covering_ancestor', '?')}") + tooltip = "
".join(tooltip_parts) + + # Node shape: diamond for abstract, box for concrete + is_abs = cls.get("is_abstract", False) + # Diamond labels render BELOW the shape on the white page background, + # so always use dark font for diamonds regardless of coverage status + font_color = "#2c3e50" if is_abs else c["font"] + + nodes.append(Node( + id=name, + label=label, + title=tooltip, + color={"background": c["background"], "border": border_color}, + shape="diamond" if is_abs else "box", + size=25, + font={"color": font_color, "size": 12}, + borderWidth=border_width, + borderWidthSelected=3, + )) + displayed.add(name) + + # subClassOf edges (hierarchy) + for cls in classes[:max_nodes]: + parent = cls.get("parent_name") + if parent and cls["name"] in displayed: + if parent not in displayed: + # Add missing parent as a ghost node + parent_cls = name_to_cls.get(parent, {}) + nodes.append(Node( + id=parent, + label=parent_cls.get("label", parent), + title=f"{parent_cls.get('label', parent)}
Not in visible set", + color={"background": "#ecf0f1", "border": "#bdc3c7"}, + shape="diamond", + size=25, + font={"color": "#95a5a6", "size": 12}, + borderWidth=2, + )) + displayed.add(parent) + edges.append(Edge( + source=cls["name"], + target=parent, + color="#5dade2", + label="subClassOf", + width=1.5, + arrows="to", + dashes=False, + font={"size": 8, "color": "#5dade2", "strokeWidth": 0}, + )) + + # Other relation edges + if show_relations: + rel_name_to_rel = {r["name"]: r for r in relations} + for rel in relations: + if rel["name"] == "subClassOf": + continue + is_abstract_rel = rel.get("is_abstract", False) + + # Layer filtering for relations + if layer_filter == "concrete" and is_abstract_rel: + continue + if layer_filter == "abstract" and not is_abstract_rel: + continue + + domain = rel.get("domain_class", "") + rng = rel.get("range_class", "") + if domain in displayed and rng in displayed: + if rel["name"] in added_relations: + edge_color = "#f1c40f" # gold for newly added + edge_width = 2.0 + elif is_abstract_rel: + edge_color = "#8e44ad" # deep purple for abstract relations + edge_width = 2.5 + elif rel.get("is_hierarchical"): + edge_color = "#e67e22" + edge_width = 1.0 + else: + edge_color = "#9b59b6" + edge_width = 1.0 + edges.append(Edge( + source=domain, + target=rng, + color=edge_color, + label=rel["name"], + width=edge_width, + arrows="to", + dashes=True if not is_abstract_rel else False, + font={"size": 8, "color": edge_color, "strokeWidth": 0}, + )) + + # Relation hierarchy edges: child relation → parent relation + # Rendered as thin dotted lines between the midpoints of related edges + if layer_filter in ("all", "abstract"): + for rel in relations: + parent_rel_name = rel.get("parent_name") + if not parent_rel_name: + continue + parent_rel = rel_name_to_rel.get(parent_rel_name) + if not parent_rel: + continue + # Show relation hierarchy by connecting child domain → parent domain + # only if both endpoints are visible + child_domain = 
def _detail_short_view(view_name: str) -> str:
    """Return *view_name* cut down to start at its ``VW_ONT_`` marker.

    The shortened form is upper-cased. Names without the marker are returned
    unchanged.
    """
    upper = view_name.upper()
    if "VW_ONT_" in upper:
        return "VW_ONT_" + upper.split("VW_ONT_", 1)[1]
    return view_name


def _detail_table_fqn(tbl: dict) -> str:
    """Format a semantic-model table's ``base_table`` as ``database.schema.table``."""
    bt = tbl.get("base_table", {})
    return f"{bt.get('database', '')}.{bt.get('schema', '')}.{bt.get('table', '')}"


def render_node_detail(cls_name: str, classes: list[dict], relations: list[dict],
                       coverage_map: dict[str, dict], sem_model: dict | None,
                       deployed_objects: dict | None = None) -> None:
    """Render the detail inspector panel for a clicked node.

    Args:
        cls_name: ``name`` of the class whose node was clicked. A warning is
            shown and nothing else is rendered when no class has that name.
        classes: Ontology class dicts. ``name`` is required; ``label``,
            ``description``, ``parent_name`` and ``is_abstract`` are read
            when present.
        relations: Relation dicts. ``name`` is required; ``domain_class``,
            ``range_class`` and ``description`` are read when present.
        coverage_map: Maps class name to a coverage dict with a ``status``
            key (``mapped``, ``covered``, ``unmapped`` or ``abstract``) and
            optional ``view_name``, ``covering_ancestor``, ``covering_view``
            and ``sf_objects`` entries. Missing classes count as unmapped.
        sem_model: Parsed semantic model whose ``tables`` list is scanned
            for matching views, or ``None``.
        deployed_objects: Currently unused by this function.
    """
    import html  # local import: only needed for the HTML header below

    name_to_cls = {c["name"]: c for c in classes}
    cls = name_to_cls.get(cls_name)
    if not cls:
        st.warning(f"Class '{cls_name}' not found.")
        return

    label = cls.get("label", cls_name)
    cov = coverage_map.get(cls_name, {"status": "unmapped"})
    status = cov["status"]
    tables = sem_model.get("tables", []) if sem_model else []

    # --- Header with coverage badge ---
    badge_colors = {
        "mapped": "#2ecc71", "covered": "#3498db",
        "unmapped": "#e74c3c", "abstract": "#95a5a6",
    }
    badge_labels = {
        "mapped": "MAPPED TO VIEW", "covered": "COVERED AS ROWS",
        "unmapped": "UNMAPPED", "abstract": "ABSTRACT",
    }
    bc = badge_colors.get(status, "#95a5a6")
    bl = badge_labels.get(status, status.upper())
    # The header is emitted as raw HTML, so ontology-supplied text must be
    # escaped or a label containing "<" or "&" would corrupt the panel.
    # NOTE(review): the inline styles are a minimal choice; only the label,
    # badge text and badge colour are fixed inputs — confirm the look.
    st.markdown(
        f'<div><h3 style="margin:0;">{html.escape(str(label))}</h3></div>'
        f'<span style="background:{bc};color:#fff;padding:2px 8px;'
        f'border-radius:4px;font-size:0.75em;">{html.escape(str(bl))}</span>',
        unsafe_allow_html=True,
    )

    # Description
    if cls.get("description"):
        st.caption(cls["description"])

    st.divider()

    # --- Class info ---
    type_label = "Abstract" if cls.get("is_abstract") else "Concrete"
    st.markdown(f"**Type:** {type_label}")
    if cls.get("parent_name"):
        parent_cls = name_to_cls.get(cls["parent_name"], {})
        st.markdown(f"**Parent:** {parent_cls.get('label', cls['parent_name'])}")

    # Children
    children = [c for c in classes if c.get("parent_name") == cls_name]
    if children:
        child_labels = ", ".join(sorted(c.get("label", c["name"]) for c in children))
        st.markdown(f"**Children ({len(children)}):** {child_labels}")

    st.divider()

    # --- Snowflake Implementation ---
    st.markdown("#### Snowflake Implementation")

    # Show all deployed Snowflake objects for this class (from manifest)
    sf_objs = cov.get("sf_objects", [])
    if sf_objs:
        st.markdown("**Deployed objects:**")
        for obj in sf_objs:
            st.write(f"- `{obj}`")

    # First table whose name mentions SUBCLASS is taken as the subClassOf
    # edge table; several sections below refer to it.
    subclass_table_info = next(
        (
            {
                "name": tbl["name"],
                "fqn": _detail_table_fqn(tbl),
                "dims": tbl.get("dimensions", []),
            }
            for tbl in tables
            if "SUBCLASS" in tbl["name"].upper()
        ),
        None,
    )

    if status == "mapped":
        view_name = cov.get("view_name", "")
        st.markdown(f"**View:** `{_detail_short_view(view_name)}`")

        # Find the table definition in the semantic model
        for tbl in tables:
            if tbl["name"].upper() != view_name.upper():
                continue
            st.markdown(f"**Base table:** `{_detail_table_fqn(tbl)}`")

            dims = tbl.get("dimensions", [])
            if dims:
                st.markdown("**Dimensions:**")
                for d in dims:
                    desc = f" — {d['description']}" if d.get("description") else ""
                    st.markdown(f"- `{d['name']}` ({d.get('data_type', 'VARCHAR')}){desc}")

            facts = tbl.get("facts", [])
            if facts:
                st.markdown("**Facts:**")
                for f_item in facts:
                    st.markdown(f"- `{f_item['name']}` ({f_item.get('data_type', 'FLOAT')})")
            break  # only the first matching table is shown

    elif status == "covered":
        ancestor = cov.get("covering_ancestor", "?")
        short_cv = _detail_short_view(cov.get("covering_view", ""))
        st.markdown(f"**Covered by ancestor:** {ancestor}")
        st.markdown(f"**Ancestor's view:** `{short_cv}`")
        st.info(f"Rows for *{label}* exist as typed rows (ENTITY_TYPE column) within the ancestor's view.")

    elif status == "unmapped":
        st.error("No view or ancestor view covers this class.")
        st.caption("To add coverage, create a mapping in the ontology mappings file.")

    elif status == "abstract":
        st.markdown("**No dedicated view** — abstract classes are structural groupings.")
        if subclass_table_info:
            st.markdown("Appears as parent/child references in the hierarchy edge table:")
            st.markdown(f"- `{subclass_table_info['fqn']}`")
        if children:
            st.info(f"This class organizes {len(children)} child classes. "
                    "It exists in the hierarchy edge table as PARENT_NAME values.")
        else:
            st.info("Structural node with no children — leaf abstract class.")

    # --- Hierarchy edge (subClassOf) implementation ---
    if cls.get("parent_name") and subclass_table_info:
        st.divider()
        st.markdown("#### Hierarchy Edge (subClassOf)")
        st.markdown(f"**Edge table:** `{subclass_table_info['fqn']}`")
        parent_label = name_to_cls.get(cls["parent_name"], {}).get("label", cls["parent_name"])
        st.code(
            f"-- Row in edge table:\n"
            f"CHILD_NAME = '{label}'\n"
            f"PARENT_NAME = '{parent_label}'\n"
            f"REL_TYPE = 'subClassOf'",
            language="sql",
        )
        dim_names = [d["name"] for d in subclass_table_info.get("dims", [])]
        if dim_names:
            st.caption(f"Columns: {', '.join(dim_names)}")

    # --- Relations involving this class ---
    involved_rels = [r for r in relations
                     if r.get("domain_class") == cls_name or r.get("range_class") == cls_name]
    if not involved_rels:
        return

    st.divider()
    st.markdown("#### Relations")
    for r in involved_rels:
        is_domain = r.get("domain_class") == cls_name
        other = r.get("range_class") if is_domain else r.get("domain_class")
        # A relation with a missing endpoint shows "?" rather than "None".
        other_label = name_to_cls.get(other, {}).get("label", other) or "?"
        if is_domain:
            arrow = f"{label} —[{r['name']}]→ {other_label}"
        else:
            arrow = f"{other_label} —[{r['name']}]→ {label}"
        st.markdown(f"- {arrow}")
        if r.get("description"):
            st.caption(f" {r['description'][:120]}")

        # Show edge table if in semantic model
        if r["name"] == "subClassOf":
            if subclass_table_info:
                st.markdown(f" Edge table: `{subclass_table_info['fqn']}`")
            continue
        rel_upper = r["name"].upper().replace(" ", "_")
        for tbl in tables:
            tname_upper = tbl["name"].upper()
            if rel_upper in tname_upper and "VW_ONT_" in tname_upper:
                st.markdown(f" Edge table: `{_detail_table_fqn(tbl)}`")
                break
def render_default_detail(classes: list[dict], relations: list[dict],
                          coverage_map: dict[str, dict], sem_model: dict | None,
                          deployed_objects: dict | None = None) -> None:
    """Render the fallback detail panel shown while no graph node is selected.

    The panel shows, in order: a prompt to click a node, per-status coverage
    counts, a summary of deployed Snowflake artifacts (when a manifest is
    given), the list of relations, and a summary of the semantic model with
    its verified queries (when a model is given).

    Args:
        classes: Ontology class dicts; not read by this function.
        relations: Relation dicts; ``name`` is required, ``domain_class``,
            ``range_class``, ``is_hierarchical`` and ``is_transitive`` are
            read when present.
        coverage_map: Maps class name to a dict carrying a ``status`` key.
        sem_model: Parsed semantic model, or ``None``.
        deployed_objects: Deployment manifest, or ``None``.
    """
    st.markdown("#### Click a node to inspect")
    st.caption("Click any node in the graph to see its Snowflake implementation details.")

    st.divider()

    # Coverage tally, one metric per status
    tally = defaultdict(int)
    for entry in coverage_map.values():
        tally[entry["status"]] += 1

    metric_specs = (
        ("Mapped", "mapped"),
        ("Covered", "covered"),
        ("Unmapped", "unmapped"),
        ("Abstract", "abstract"),
    )
    for column, (title, status_key) in zip(st.columns(4), metric_specs):
        column.metric(title, tally.get(status_key, 0))

    # Manifest-backed artifact counts
    if deployed_objects:
        st.divider()
        st.markdown("#### Snowflake Artifacts")
        database = deployed_objects.get("database", "")
        schema_name = deployed_objects.get("schema", "")
        if database and schema_name:
            st.caption(f"Target: **{database}.{schema_name}**")
        counts = {
            kind: len(deployed_objects.get(kind, []))
            for kind in ("views", "tables", "procedures", "udfs")
        }
        st.write(f"- **{counts['views']}** views, **{counts['tables']}** tables")
        st.write(f"- **{counts['procedures']}** procedures, **{counts['udfs']}** UDFs")

    # One bullet per relation, with its notable properties in parentheses
    if relations:
        st.divider()
        st.markdown("#### Relations")
        flag_names = (("is_hierarchical", "hierarchical"), ("is_transitive", "transitive"))
        for relation in relations:
            traits = [text for flag, text in flag_names if relation.get(flag)]
            suffix = f" ({', '.join(traits)})" if traits else ""
            source_cls = relation.get("domain_class", "?")
            target_cls = relation.get("range_class", "?")
            st.markdown(f"- **{relation['name']}**: {source_cls} → {target_cls}{suffix}")

    if not sem_model:
        return

    # Semantic model: table list followed by verified queries
    st.divider()
    st.markdown("#### Semantic Model")
    st.markdown(f"**{sem_model.get('name', 'Unknown')}**")
    for table in sem_model.get("tables", []):
        display_name = table["name"]
        upper_name = display_name.upper()
        if "VW_ONT_" in upper_name:
            # Drop any prefix before the VW_ONT_ marker.
            display_name = "VW_ONT_" + upper_name.split("VW_ONT_", 1)[1]
        dim_count = len(table.get("dimensions", []))
        fact_count = len(table.get("facts", []))
        st.markdown(f"- `{display_name}` — {dim_count} dims, {fact_count} facts")

    verified = sem_model.get("verified_queries", [])
    if verified:
        st.divider()
        st.markdown(f"#### Verified Queries ({len(verified)})")
        for query in verified:
            heading = query.get('question', query.get('name', ''))
            with st.expander(f"Q: {heading}"):
                st.code(query.get("sql", ""), language="sql")
def _parse_cli_args(argv: list[str]) -> argparse.Namespace:
    """Extract this app's own options from *argv*.

    Everything after a literal ``--`` is treated as ours; without one, all
    arguments after the program name are tried. Unrecognised options are
    ignored rather than rejected.
    """
    if "--" in argv:
        custom_args = argv[argv.index("--") + 1:]
    else:
        custom_args = argv[1:]

    parser = argparse.ArgumentParser()
    parser.add_argument("--classes-json", required=True)
    parser.add_argument("--relations-json", required=True)
    parser.add_argument("--semantic-model", default=None)
    parser.add_argument("--deployed-objects", default=None,
                        help="JSON manifest mapping ontology concepts to deployed Snowflake objects")
    parser.add_argument("--stats-json", default=None)
    parser.add_argument("--port", default="8501")
    args, _ = parser.parse_known_args(custom_args)
    return args


def _load_if_present(path: str | None, loader):
    """Return ``loader(path)`` when *path* is set and exists on disk, else ``None``."""
    if path and Path(path).exists():
        return loader(path)
    return None


def _ancestor_path(cls: dict, name_to_cls: dict[str, dict]) -> str:
    """Return the root-to-parent label path of *cls*, or ``"root"`` without a parent.

    The walk stops when it meets a class it has already visited, so a cyclic
    ``parent_name`` chain cannot hang the app.
    """
    labels = []
    visited = set()
    current = cls.get("parent_name")
    while current and current not in visited:
        visited.add(current)
        node = name_to_cls.get(current, {})
        labels.append(node.get("label", current))
        current = node.get("parent_name")
    labels.reverse()
    return " → ".join(labels) if labels else "root"


def _render_class_hierarchy(classes: list[dict]) -> None:
    """Render the "Class Hierarchy" expander: controls, search results or tree."""
    with st.expander("Class Hierarchy", expanded=True):
        children_map, roots, name_to_cls = build_tree(classes)

        col_ctrl1, col_ctrl2 = st.columns([1, 1])
        with col_ctrl1:
            max_depth = st.slider("Max display depth", 1, 15, 6, key="cls_depth")
        with col_ctrl2:
            cls_view_mode = st.radio("View", ["Interactive", "Text"],
                                     horizontal=True, key="cls_view")

        # Search
        search = st.text_input("Search classes", "", placeholder="Type to filter...")

        # Legend
        st.markdown(
            "🔹 = abstract (no data rows)&nbsp;&nbsp;&nbsp; "
            "🟢 = concrete (has data)&nbsp;&nbsp;&nbsp; 🔍 = search match",
            unsafe_allow_html=True
        )
        st.divider()

        if search:
            needle = search.lower()
            matches = [c for c in classes
                       if needle in c.get("label", "").lower() or needle in c["name"].lower()]
            st.write(f"**{len(matches)} matches** for \"{search}\":")
            for m in matches[:50]:  # cap the result list at 50 entries
                icon = "🔹" if m.get("is_abstract") else "🟢"
                st.markdown(f"{icon} **{m.get('label', m['name'])}**")
                st.caption(f"Path: {_ancestor_path(m, name_to_cls)}")
                if m.get("description"):
                    st.caption(m["description"][:200])
        elif cls_view_mode == "Interactive":
            for root in sorted(roots):
                render_interactive_tree(root, children_map, name_to_cls,
                                        max_depth=max_depth, search=search)
        else:
            tree_text = "".join(
                render_tree_text(root, children_map, name_to_cls, max_depth=max_depth)
                for root in sorted(roots)
            )
            if tree_text:
                st.code(tree_text, language=None)
            else:
                st.info("No root classes found. The ontology may use a flat structure.")


def _render_relation_hierarchy(relations: list[dict]) -> None:
    """Render the "Relation Hierarchy" expander in interactive or text form."""
    import html  # local import: relation text is embedded in raw HTML below

    with st.expander("Relation Hierarchy", expanded=True):
        rel_children_map, rel_roots, flat_concrete, rel_by_name = build_relation_tree(relations)

        rel_view_mode = st.radio("View", ["Interactive", "Text"],
                                 horizontal=True, key="rel_view")

        # Legend
        st.markdown(
            "🔷 = abstract relation&nbsp;&nbsp;&nbsp; 🟣 = concrete relation",
            unsafe_allow_html=True
        )
        st.divider()

        if not rel_roots and not flat_concrete:
            st.info("No relations defined yet. Add relations in the sidebar editor.")
        elif rel_view_mode == "Interactive":
            for root_rel in sorted(rel_roots):
                render_interactive_rel_tree(root_rel, rel_children_map, rel_by_name)
            if flat_concrete:
                st.markdown("**Ungrouped concrete relations**")
                for name in sorted(flat_concrete):
                    r = rel_by_name[name]
                    # Escape ontology-supplied text: it is rendered as raw HTML.
                    rel_label = html.escape(str(r.get("label", name)))
                    domain = html.escape(str(r.get("domain_class", "?")))
                    rng = html.escape(str(r.get("range_class", "?")))
                    desc = r.get("description", "")
                    st.markdown(f'<div>🟣 {rel_label} ({domain} -> {rng})</div>',
                                unsafe_allow_html=True)
                    if desc:
                        st.markdown(f'<div>{html.escape(str(desc))}</div>',
                                    unsafe_allow_html=True)
        else:
            parts = [render_rel_tree_text(root_rel, rel_children_map, rel_by_name)
                     for root_rel in sorted(rel_roots)]
            if flat_concrete:
                parts.append("\n(ungrouped concrete relations)\n")
                for name in sorted(flat_concrete):
                    r = rel_by_name[name]
                    parts.append(
                        f" {r.get('label', name)} "
                        f"({r.get('domain_class', '?')} -> {r.get('range_class', '?')})\n"
                    )
            tree_text = "".join(parts)
            if tree_text:
                st.code(tree_text, language=None)
            else:
                st.info("No relation hierarchy defined. Relations are flat.")


def _graph_legend_html() -> str:
    """Build the colour legend shown above the graph as a single HTML snippet."""
    # NOTE(review): swatch colours follow the coverage-badge and edge colours
    # used elsewhere in this file; the "Modified" swatch is an assumption —
    # confirm it against the node styling.
    entries = (
        ("#2ecc71", "Mapped to View"),
        ("#3498db", "Covered as Rows"),
        ("#e74c3c", "Unmapped"),
        ("#95a5a6", "Abstract Class"),
        ("#5dade2", "subClassOf"),
        ("#9b59b6", "Concrete Rel"),
        ("#8e44ad", "Abstract Rel"),
        ("#d2b4de", "Specializes"),
        ("#f1c40f", "New"),
        ("#e67e22", "Modified"),
    )
    items = "".join(
        '<span style="margin-right:14px;white-space:nowrap;">'
        f'<span style="display:inline-block;width:12px;height:12px;background:{colour};"></span>'
        f' {text}'
        '</span>'
        for colour, text in entries
    )
    return f'<div style="font-size:0.85em;">{items}</div>'


def _render_graph_tab(classes: list[dict], relations: list[dict],
                      coverage_map: dict[str, dict], sem_model: dict | None,
                      deployed_objects: dict | None) -> None:
    """Render the "Ontology Graph" tab: controls, legend, graph and detail panel."""
    st.header("Ontology Graph")

    # Controls row
    col_g1, col_g2, col_g3, col_g4 = st.columns([1, 1, 1, 1])
    with col_g1:
        # Clamp the default into the slider range: with fewer than 10 classes
        # the unclamped default would fall below the slider minimum.
        default_nodes = max(10, min(len(classes), 50))
        max_nodes = st.slider("Max nodes", 10, 500, default_nodes, key="graph_max")
    with col_g2:
        show_rels = st.checkbox("Show relation edges", value=True, key="graph_rels")
    with col_g3:
        physics_on = st.checkbox("Physics simulation", value=True, key="graph_physics")
    with col_g4:
        layer_mode = st.radio("Layer", ["All", "Concrete", "Abstract"],
                              horizontal=True, key="graph_layer")
    layer_filter = layer_mode.lower()

    # Legend bar (extended with diff markers and abstract relations)
    st.markdown(_graph_legend_html(), unsafe_allow_html=True)

    # Two-panel layout: graph (left) + detail (right)
    col_graph, col_detail = st.columns([7, 3])

    with col_graph:
        # Build nodes and edges with diff markers
        ag_nodes, ag_edges = build_agraph_nodes_edges(
            classes, relations, coverage_map,
            max_nodes=max_nodes, show_relations=show_rels,
            added_classes=st.session_state["added_classes"],
            modified_classes=st.session_state["modified_classes"],
            added_relations=st.session_state["added_relations"],
            layer_filter=layer_filter,
        )

        config = Config(
            width=900,
            height=600,
            physics=physics_on,
            layout={"hierarchical": False},
        )

        # agraph returns the clicked node ID
        selected_node = agraph(nodes=ag_nodes, edges=ag_edges, config=config)

    with col_detail:
        if selected_node:
            render_node_detail(selected_node, classes, relations, coverage_map,
                               sem_model, deployed_objects)
            # Inline edit form below the detail panel
            _render_edit_class_form(selected_node)
        else:
            render_default_detail(classes, relations, coverage_map,
                                  sem_model, deployed_objects)


def _render_coverage_tab(classes: list[dict], cov_map: dict[str, dict],
                         sem_model: dict | None, deployed_objects: dict | None) -> None:
    """Render the "Coverage" tab, or a design-only summary when nothing is deployed."""
    if not (sem_model or deployed_objects):
        # Phase 3 pre-deployment: show design only
        st.header("Ontology Design — Coverage")
        st.info("Coverage mapping will be available after Phase 4 deployment. "
                "Currently showing the ontology design structure.")
        abstract_count = sum(1 for c in classes if c.get("is_abstract"))
        concrete_count = len(classes) - abstract_count
        st.metric("Abstract Classes (structural groupings)", abstract_count)
        st.metric("Concrete Classes (will become Snowflake views)", concrete_count)
        return

    # True when coverage comes from the deployment manifest alone.
    manifest_only = bool(deployed_objects and not sem_model)

    def by_label(c: dict):
        return c.get("label", c["name"])

    # --- Header: Original Design → Snowflake Implementation ---
    if manifest_only:
        source_label = deployed_objects.get("source", "Ontology Design")
        db = deployed_objects.get("database", "")
        schema = deployed_objects.get("schema", "")
        target_label = f"{db}.{schema}" if db and schema else "Snowflake"
        st.header("Original Design → Snowflake Implementation")
        st.caption(
            f"Mapping from **{source_label}** to deployed objects in **{target_label}**"
        )
    else:
        st.header("Ontology-to-Table Coverage")

    directly_mapped = [c for c in classes if cov_map[c["name"]]["status"] == "mapped"]
    covered_by_parent = [
        (c, cov_map[c["name"]]["covering_ancestor"])
        for c in classes if cov_map[c["name"]]["status"] == "covered"
    ]
    truly_unmapped = [c for c in classes if cov_map[c["name"]]["status"] == "unmapped"]

    # Summary metrics
    total_abstract = sum(1 for c in classes if c.get("is_abstract"))
    total_concrete = len(classes) - total_abstract
    concrete_mapped = [c for c in directly_mapped if not c.get("is_abstract")]
    concrete_covered_count = len(concrete_mapped) + len(covered_by_parent)

    mcol1, mcol2, mcol3, mcol4 = st.columns(4)
    mcol1.metric("Total Classes", len(classes))
    mcol2.metric("Abstract (no view needed)", total_abstract)
    mcol3.metric("Concrete Implemented", concrete_covered_count)
    mcol4.metric("Not Yet Mapped", len(truly_unmapped))

    if total_concrete > 0:
        # Capped at 1.0 so the progress bar never receives an out-of-range value.
        ratio = min(concrete_covered_count / total_concrete, 1.0)
        st.progress(ratio,
                    text=f"{concrete_covered_count}/{total_concrete} concrete classes "
                         f"mapped to Snowflake ({100 * ratio:.0f}%)")

    # Three-column detail view
    col1, col2, col3 = st.columns(3)
    with col1:
        st.subheader(f"Mapped to Snowflake ({len(directly_mapped)})")
        if manifest_only:
            st.caption("Ontology class has dedicated Snowflake view(s)")
        else:
            st.caption("Has a dedicated view in the semantic model")
        for c in sorted(directly_mapped, key=by_label):
            view_name = cov_map[c["name"]].get("view_name", "")
            upper_view = view_name.upper()
            short = view_name
            if "VW_ONT_" in upper_view:
                short = "VW_ONT_" + upper_view.split("VW_ONT_", 1)[1]
            elif "V_" in upper_view:
                short = upper_view
            # Show all Snowflake objects for this class if available
            sf_objs = cov_map[c["name"]].get("sf_objects", [])
            if sf_objs and len(sf_objs) > 1:
                st.write(f"- **{c.get('label', c['name'])}** →")
                for obj in sf_objs:
                    st.write(f" - `{obj}`")
            else:
                st.write(f"- **{c.get('label', c['name'])}** → `{short}`")
    with col2:
        st.subheader(f"Covered by Ancestor ({len(covered_by_parent)})")
        st.caption("No own view, but included as rows in a parent's view")
        by_ancestor: dict[str, list] = defaultdict(list)
        for cls, ancestor in covered_by_parent:
            by_ancestor[ancestor].append(cls)
        for ancestor in sorted(by_ancestor):
            children = by_ancestor[ancestor]
            with st.expander(f"via **{ancestor}** ({len(children)} classes)"):
                for c in sorted(children, key=by_label):
                    st.write(f"- {c.get('label', c['name'])}")
    with col3:
        st.subheader(f"Unmapped ({len(truly_unmapped)})")
        st.caption("No Snowflake view or ancestor view covers this class")
        for c in sorted(truly_unmapped, key=by_label):
            st.write(f"- {c.get('label', c['name'])}")
        if not truly_unmapped:
            st.success("All concrete classes are implemented in Snowflake!")

    if not manifest_only:
        return

    def write_inventory(key: str, title: str) -> None:
        names = deployed_objects.get(key, [])
        if names:
            st.markdown(f"**{title} ({len(names)})**")
            for item in sorted(names):
                st.write(f"- `{item}`")

    # --- Full Artifact Inventory (deployed-objects mode) ---
    st.divider()
    with st.expander("Full Artifact Inventory — All Snowflake Objects Generated"):
        inv_col1, inv_col2 = st.columns(2)
        with inv_col1:
            write_inventory("views", "Views")
            write_inventory("tables", "Tables")
        with inv_col2:
            write_inventory("procedures", "Stored Procedures")
            write_inventory("udfs", "UDFs / Graph Tools")

    # --- Relation mapping ---
    relation_to_objects = deployed_objects.get("relation_to_objects", {})
    if relation_to_objects:
        st.divider()
        st.subheader("Relation → Snowflake Mapping")
        for rel_name, rel_info in sorted(relation_to_objects.items()):
            view = rel_info.get("view", "")
            meta = rel_info.get("metadata_row", "")
            mapped_to = ", ".join(f"`{part}`" for part in (view, meta) if part)
            st.write(f"- **{rel_name}** → {mapped_to}")


def main():
    """Parse CLI options, load the ontology inputs and render the Streamlit app.

    The classes and relations files are required. The stats, semantic-model
    and deployed-objects files are optional and are skipped when the path is
    not given or does not exist. The page has a summary and editor sidebar
    plus three tabs: Hierarchy, Ontology Graph and Coverage.
    """
    # Parse args before Streamlit takes over
    args = _parse_cli_args(sys.argv)

    # Load data from disk (the "original" baseline for undo)
    original_classes = load_json(args.classes_json)
    original_relations = load_json(args.relations_json)
    stats = _load_if_present(args.stats_json, load_json)
    sem_model = _load_if_present(args.semantic_model, load_yaml)
    deployed_objects = _load_if_present(args.deployed_objects, load_json)

    # --- Streamlit App ---
    st.set_page_config(page_title="Ontology Editor", layout="wide")
    st.title("Ontology Semantic Modeler - Visualize & Edit")

    # Initialize mutable session state
    _init_session_state(original_classes, original_relations,
                        args.classes_json, args.relations_json)

    # Live working copies from session state
    classes = st.session_state["classes"]
    relations = st.session_state["relations"]

    # Sidebar: Summary stats
    st.sidebar.header("Ontology Summary")
    if stats:
        st.sidebar.metric("Total Classes", stats["total_classes"])
        st.sidebar.metric("Abstract", stats["abstract_classes"])
        st.sidebar.metric("Concrete", stats["concrete_classes"])
        st.sidebar.metric("Relations", stats["total_relations"])
        st.sidebar.metric("Max Depth", stats["max_hierarchy_depth"])
        if stats.get("total_individuals"):
            st.sidebar.metric("Individuals", stats["total_individuals"])
    else:
        st.sidebar.metric("Classes", len(classes))
        st.sidebar.metric("Relations", len(relations))

    # Sidebar: Ontology Editor (add/delete/save/undo)
    _render_editor_sidebar(original_classes, original_relations)

    # Tab layout (3 tabs: Hierarchy, Graph, Coverage)
    tab_tree, tab_graph, tab_coverage = st.tabs([
        "Hierarchy", "Ontology Graph", "Coverage"
    ])

    # Build coverage map once (shared across tabs)
    coverage_map = build_coverage_map(classes, sem_model, deployed_objects)

    # --- Tab 1: Hierarchy (Classes + Relations) ---
    with tab_tree:
        st.header("Hierarchy")
        _render_class_hierarchy(classes)
        _render_relation_hierarchy(relations)

    # --- Tab 2: Ontology Graph (merged Graph + Relations + Semantic Model) ---
    with tab_graph:
        _render_graph_tab(classes, relations, coverage_map, sem_model, deployed_objects)

    # --- Tab 3: Coverage Matrix ---
    with tab_coverage:
        _render_coverage_tab(classes, coverage_map, sem_model, deployed_objects)


if __name__ == "__main__":
    main()