diff --git a/.github/scripts/setup.py b/.github/scripts/setup.py index 6715b0d5..f8eb9d4f 100644 --- a/.github/scripts/setup.py +++ b/.github/scripts/setup.py @@ -33,6 +33,12 @@ long_description_content_type = "text/markdown", url="https://github.com/evgskv/logica", packages=setuptools.find_namespace_packages(), + package_data={ + # The release script clones the repo into a `logica/` folder and builds a + # namespace package, so this package is named `logica.parser_cpp`. + # Ship the C++ source so the runtime bridge can build liblogica_parse_cpp.so. + 'logica.parser_cpp': ['logica_parse.cpp'], + }, classifiers = [ "Topic :: Database", "License :: OSI Approved :: Apache Software License" diff --git a/colab_logica.py b/colab_logica.py index 4766bd90..353c8f88 100755 --- a/colab_logica.py +++ b/colab_logica.py @@ -551,3 +551,11 @@ def PostgresJumpStart(): print('Installation succeeded. Connecting...') # Connect to the database. ConnectToLocalPostgres() + + +def UseCppParser(): + os.environ['LOGICA_PARSER'] = 'CPP' + from .parser_cpp import logica_parse_cpp # type: ignore + so_path = logica_parse_cpp.EnsureCppParserSharedObject() + logica_parse_cpp.LoadCppParserLib() + return so_path diff --git a/integration_tests/duckdb_json_test.l b/integration_tests/duckdb_json_test.l index c1f6f351..e7015e69 100644 --- a/integration_tests/duckdb_json_test.l +++ b/integration_tests/duckdb_json_test.l @@ -16,7 +16,7 @@ Data() = Join(['{"a": 1, "b": [1,2]', ' "c": {"d": "A", "e": [1,2,3]}}'], ","); Data() = '{"a": 2, "b": [3,4], "c": {"d": "B", "e": [4,5,6]}}'; -Test(a: record.a, e_element: x, record:) order_by("record") :- +Test(a: record.a, e_element: x, record:) order_by("record", "e_element") :- d = Data(), record = Cast(d, {a: Num, b: [Num], c: {d: Str, e: [Num]}}), x in record.c.e; diff --git a/parser_cpp/logica_parse.cpp b/parser_cpp/logica_parse.cpp index 7d78df5b..0d69ba16 100644 --- a/parser_cpp/logica_parse.cpp +++ b/parser_cpp/logica_parse.cpp @@ -47,6 +47,7 @@ #include #include #include +#include namespace logica::parser { @@ -55,6 +56,7 @@ namespace logica::parser { // ------------------------------ struct Json; + using JsonObject = std::map; using JsonArray = std::vector; @@ -110,6 +112,35 @@ struct Json { return out; } + // Match the Python parser's dict insertion order for key traversal. + // This matters because some downstream code walks dicts in insertion order + // (e.g. type inference assigns stable-but-order-dependent type_id values). + static int KeyPriority(std::string_view k) { + // High-level rule structure. + if (k == "head") return 0; + if (k == "body") return 1; + if (k == "full_text") return 99; + + // Common predicate/call shapes. + if (k == "predicate_name") return 0; + if (k == "record") return 1; + if (k == "field_value") return 0; + if (k == "field") return 0; + if (k == "value") return 1; + + // Expressions. + if (k == "expression") return 0; + if (k == "literal") return 1; + if (k == "variable") return 2; + if (k == "predicate") return 3; + + // Bodies. + if (k == "conjunction") return 0; + if (k == "conjunct") return 1; + + return 50; + } + void Dump(std::ostream& os, bool pretty, int indent, int depth) const { const auto pad = [&](int d) { if (!pretty) return; @@ -143,13 +174,25 @@ struct Json { os << '{'; if (!o.empty()) { if (pretty) os << '\n'; - size_t i = 0; - for (const auto& [k, val] : o) { + // Iterate keys in a stable Python-like order. + std::vector*> items; + items.reserve(o.size()); + for (const auto& kv : o) items.push_back(&kv); + std::sort(items.begin(), items.end(), [](const auto* a, const auto* b) { + int pa = KeyPriority(a->first); + int pb = KeyPriority(b->first); + if (pa != pb) return pa < pb; + return a->first < b->first; + }); + + for (size_t i = 0; i < items.size(); ++i) { + const auto& k = items[i]->first; + const auto& val = items[i]->second; pad(depth + 1); os << '"' << Escape(k) << '"' << ':'; if (pretty) os << ' '; val.Dump(os, pretty, indent, depth + 1); - if (++i < o.size()) os << ','; + if (i + 1 < items.size()) os << ','; if (pretty) os << '\n'; } pad(depth); @@ -222,6 +265,189 @@ struct SpanString { } }; +// ------------------------------ +// Pooled heritage emission (for Python bridge). +// ------------------------------ + +struct HeritagePool { + struct Entry { + std::shared_ptr bytes; + bool has_non_ascii = false; + std::vector byte_to_char; // size = bytes->size() + 1, built lazily. + }; + + std::unordered_map index; + std::vector heritage; + + int64_t Intern(const std::shared_ptr& h) { + const std::string* p = h.get(); + auto it = index.find(p); + if (it != index.end()) return it->second; + int64_t idx = static_cast(heritage.size()); + index[p] = idx; + Entry e; + e.bytes = h; + e.has_non_ascii = false; + for (unsigned char c : *h) { + if (c >= 0x80) { e.has_non_ascii = true; break; } + } + heritage.push_back(std::move(e)); + return idx; + } + + int64_t ByteOffsetToCharOffset(int64_t idx, int64_t byte_offset) { + if (idx < 0 || static_cast(idx) >= heritage.size()) return 0; + Entry& e = heritage[static_cast(idx)]; + const std::string& s = *(e.bytes); + if (byte_offset < 0) byte_offset = 0; + if (static_cast(byte_offset) > s.size()) byte_offset = static_cast(s.size()); + if (!e.has_non_ascii) return byte_offset; + if (e.byte_to_char.empty()) { + e.byte_to_char.resize(s.size() + 1); + int64_t chars = 0; + e.byte_to_char[0] = 0; + for (size_t i = 0; i < s.size(); ++i) { + unsigned char c = static_cast(s[i]); + // UTF-8 continuation bytes are 10xxxxxx. + if ((c & 0xC0) != 0x80) { + ++chars; + } + e.byte_to_char[i + 1] = chars; + } + } + return e.byte_to_char[static_cast(byte_offset)]; + } +}; + +static thread_local bool g_emit_pooled_heritage = false; +static thread_local HeritagePool* g_heritage_pool = nullptr; + +static Json SpanRefJson(const SpanString& s) { + if (!g_emit_pooled_heritage || !g_heritage_pool) { + return Json(s.str()); + } + int64_t idx = g_heritage_pool->Intern(s.heritage); + // Hot path: keep spans compact to reduce JSON size and Python json.loads + // allocations. + // Encoding (v2): [idx, start, stop] + // Legacy encoding (v1): ["__hs", idx, start, stop] + JsonArray a; + a.reserve(3); + a.push_back(Json(idx)); + a.push_back(Json(static_cast(s.start))); + a.push_back(Json(static_cast(s.stop))); + return Json(a); +} + +static std::string SpanTextFromJson(const Json& j) { + if (j.is_string()) return j.as_string(); + if (j.is_array()) { + const auto& a = j.as_array(); + // v2: [idx, start, stop] + if (a.size() == 3 && a[0].is_int() && g_heritage_pool) { + const int64_t idx = a[0].as_int(); + int64_t start = a[1].as_int(); + int64_t stop = a[2].as_int(); + const auto& h = *g_heritage_pool->heritage.at(idx).bytes; + if (start < 0) start = 0; + if (stop < start) stop = start; + if (static_cast(stop) > h.size()) stop = static_cast(h.size()); + return h.substr(static_cast(start), static_cast(stop - start)); + } + // v1: ["__hs", idx, start, stop] + if (a.size() == 4 && a[0].is_string() && a[0].as_string() == "__hs" && g_heritage_pool) { + const int64_t idx = a[1].as_int(); + int64_t start = a[2].as_int(); + int64_t stop = a[3].as_int(); + const auto& h = *g_heritage_pool->heritage.at(idx).bytes; + if (start < 0) start = 0; + if (stop < start) stop = start; + if (static_cast(stop) > h.size()) stop = static_cast(h.size()); + return h.substr(static_cast(start), static_cast(stop - start)); + } + } + if (j.is_object()) { + const auto& o = j.as_object(); + auto it = o.find("__hs"); + if (it != o.end() && g_heritage_pool) { + int64_t idx = it->second.as_int(); + int64_t start = o.count("start") ? o.at("start").as_int() : 0; + int64_t stop = o.count("stop") ? o.at("stop").as_int() : static_cast(g_heritage_pool->heritage.at(idx).bytes->size()); + const auto& h = *g_heritage_pool->heritage.at(idx).bytes; + if (start < 0) start = 0; + if (stop < start) stop = start; + if (static_cast(stop) > h.size()) stop = static_cast(h.size()); + return h.substr(static_cast(start), static_cast(stop - start)); + } + } + return std::string(); +} + +static SpanString SpanFromJson(const Json& j) { + if (j.is_string()) return SpanString(j.as_string()); + if (j.is_array()) { + const auto& a = j.as_array(); + // v2: [idx, start, stop] + if (a.size() == 3 && a[0].is_int() && g_heritage_pool) { + const int64_t idx = a[0].as_int(); + const size_t start = static_cast(a[1].as_int()); + const size_t stop = static_cast(a[2].as_int()); + return SpanString(g_heritage_pool->heritage.at(idx).bytes, start, stop); + } + // v1: ["__hs", idx, start, stop] + if (a.size() == 4 && a[0].is_string() && a[0].as_string() == "__hs" && g_heritage_pool) { + const int64_t idx = a[1].as_int(); + const size_t start = static_cast(a[2].as_int()); + const size_t stop = static_cast(a[3].as_int()); + return SpanString(g_heritage_pool->heritage.at(idx).bytes, start, stop); + } + } + if (j.is_object()) { + const auto& o = j.as_object(); + if (o.count("__hs") && g_heritage_pool) { + int64_t idx = o.at("__hs").as_int(); + size_t start = o.count("start") ? static_cast(o.at("start").as_int()) : 0; + size_t stop = o.count("stop") ? static_cast(o.at("stop").as_int()) : g_heritage_pool->heritage.at(idx).bytes->size(); + return SpanString(g_heritage_pool->heritage.at(idx).bytes, start, stop); + } + } + return SpanString(SpanTextFromJson(j)); +} + +static void ConvertSpanOffsetsToChar(Json& node) { + if (!g_heritage_pool) return; + if (node.is_array()) { + auto& a = node.as_array(); + // Span encoding: ["__hs", idx, start, stop] + if (a.size() == 4 && a[0].is_string() && a[0].as_string() == "__hs") { + const int64_t idx = a[1].as_int(); + const int64_t start_b = a[2].as_int(); + const int64_t stop_b = a[3].as_int(); + a[2] = Json(g_heritage_pool->ByteOffsetToCharOffset(idx, start_b)); + a[3] = Json(g_heritage_pool->ByteOffsetToCharOffset(idx, stop_b)); + return; + } + for (auto& v : a) ConvertSpanOffsetsToChar(v); + return; + } + if (!node.is_object()) return; + auto& o = node.as_object(); + const bool is_span = o.count("__hs") && o.count("start") && o.count("stop") && o.size() <= 3; + if (is_span) { + const int64_t idx = o.at("__hs").as_int(); + const int64_t start_b = o.at("start").as_int(); + const int64_t stop_b = o.at("stop").as_int(); + const int64_t start_c = g_heritage_pool->ByteOffsetToCharOffset(idx, start_b); + const int64_t stop_c = g_heritage_pool->ByteOffsetToCharOffset(idx, stop_b); + o["start"] = Json(start_c); + o["stop"] = Json(stop_c); + return; + } + for (auto& kv : o) { + ConvertSpanOffsetsToChar(kv.second); + } +} + // ------------------------------ // Parsing exception. // ------------------------------ @@ -953,7 +1179,7 @@ static Json ParseRecordInternals(const SpanString& in, bool is_record_literal, b JsonObject agg; agg["operator"] = Json(op.str()); agg["argument"] = ParseExpression(expr); - agg["expression_heritage"] = Json(value.str()); + agg["expression_heritage"] = SpanRefJson(value); JsonObject fv; fv["field"] = Json(field.str()); @@ -1107,7 +1333,7 @@ static Json BuildTreeForCombine(const Json& parsed_expression, const SpanString& JsonObject agg; agg["operator"] = Json(op.str()); agg["argument"] = parsed_expression; - agg["expression_heritage"] = Json(full_text.str()); + agg["expression_heritage"] = SpanRefJson(full_text); JsonObject agg_fv; agg_fv["field"] = Json("logica_value"); @@ -1120,7 +1346,7 @@ static Json BuildTreeForCombine(const Json& parsed_expression, const SpanString& JsonObject result; result["head"] = Json(head); result["distinct_denoted"] = Json(true); - result["full_text"] = Json(full_text.str()); + result["full_text"] = SpanRefJson(full_text); if (parsed_body) { result["body"] = Json(JsonObject{{"conjunction", *parsed_body}}); } @@ -1205,7 +1431,7 @@ static std::optional ParseConciseCombine(const SpanString& s) { Json right_expr = BuildTreeForCombine(parsed_expression, op, parsed_body ? &*parsed_body : nullptr, s); JsonObject rhs; rhs["combine"] = right_expr; - rhs["expression_heritage"] = Json(s.str()); + rhs["expression_heritage"] = SpanRefJson(s); JsonObject uni; uni["left_hand_side"] = left_expr; uni["right_hand_side"] = Json(rhs); @@ -1306,12 +1532,12 @@ static Json NegationTree(const SpanString& s, const Json& negated_proposition) { JsonObject combine; combine["body"] = negated_proposition; combine["distinct_denoted"] = Json(true); - combine["full_text"] = Json(s.str()); + combine["full_text"] = SpanRefJson(s); JsonObject agg; agg["operator"] = Json("Min"); agg["argument"] = number_one; - agg["expression_heritage"] = Json(s.str()); + agg["expression_heritage"] = SpanRefJson(s); JsonObject fv; fv["field"] = Json("logica_value"); fv["value"] = Json(JsonObject{{"aggregation", Json(agg)}}); @@ -1510,7 +1736,7 @@ static Json ActuallyParseExpression(const SpanString& s) { static Json ParseExpression(const SpanString& s) { Json e = ActuallyParseExpression(s); - e.as_object()["expression_heritage"] = Json(s.str()); + e.as_object()["expression_heritage"] = SpanRefJson(s); return e; } @@ -1577,7 +1803,7 @@ static std::pair ParseHeadCall(const SpanString& s, bool distinct_fr JsonObject agg; agg["operator"] = Json(op_str.str()); agg["argument"] = ParseExpression(expr_str); - agg["expression_heritage"] = Json(post_call_str.str()); + agg["expression_heritage"] = SpanRefJson(post_call_str); JsonObject fv; fv["field"] = Json("logica_value"); fv["value"] = Json(JsonObject{{"aggregation", Json(agg)}}); @@ -1602,7 +1828,7 @@ static std::optional ParseFunctorRule(const SpanString& s) { Json applicant = Json(JsonObject{{"expression", Json(JsonObject{{"literal", Json(JsonObject{{"the_predicate", Json(JsonObject{{"predicate_name", definition.as_object().at("predicate_name")}})}})}})}}); Json arguments = Json(JsonObject{{"expression", Json(JsonObject{{"record", definition.as_object().at("record")}})}}); JsonObject rule; - rule["full_text"] = Json(s.str()); + rule["full_text"] = SpanRefJson(s); JsonObject head; head["predicate_name"] = Json("@Make"); JsonArray fvs; @@ -1690,7 +1916,7 @@ static Json ParseRule(const SpanString& s) { if (parts.size() == 2) { result["body"] = ParseProposition(parts[1]); } - result["full_text"] = Json(s.str()); + result["full_text"] = SpanRefJson(s); return Json(result); } @@ -1815,7 +2041,7 @@ static Json MultiBodyAggregationRewrite(const Json& rules_json) { JsonArray new_rules; std::map agg_fvs_per_pred; - std::map original_full_text; + std::map original_full_text; auto split_aggregation = [&](const Json& rule) -> std::pair { Json r = rule; @@ -1852,14 +2078,14 @@ static Json MultiBodyAggregationRewrite(const Json& rules_json) { for (const auto& rule : rules) { std::string name = rule.as_object().at("head").as_object().at("predicate_name").as_string(); - original_full_text[name] = rule.as_object().at("full_text").as_string(); + original_full_text[name] = rule.as_object().at("full_text"); if (std::find(multi.begin(), multi.end(), name) != multi.end()) { auto [aggregation_fvs, new_rule] = split_aggregation(rule); if (agg_fvs_per_pred.count(name)) { Json expected = StripAggregationHeritage(agg_fvs_per_pred[name].as_object().at("field_value")); Json observed = StripAggregationHeritage(aggregation_fvs); if (expected.ToString(false) != observed.ToString(false)) { - throw ParsingException("Signature differs for bodies.", SpanString(rule.as_object().at("full_text").as_string())); + throw ParsingException("Signature differs for bodies.", SpanFromJson(rule.as_object().at("full_text"))); } } else { agg_fvs_per_pred[name] = Json(JsonObject{{"field_value", aggregation_fvs}}); @@ -1899,7 +2125,7 @@ static Json MultiBodyAggregationRewrite(const Json& rules_json) { body["conjunction"] = Json(conjunction); aggregating_rule["body"] = Json(body); } - aggregating_rule["full_text"] = Json(original_full_text[name]); + aggregating_rule["full_text"] = original_full_text[name]; aggregating_rule["distinct_denoted"] = Json(true); new_rules.push_back(Json(aggregating_rule)); } @@ -2145,16 +2371,25 @@ static Json ParseFileInternal(const std::string& content, continue; } + // Match Python parser semantics: + // - `Split(s, ';')` returns a slice whose *heritage* is the full program. + // - Python then wraps each non-import statement as `HeritageAwareString(str_statement)` + // which resets `.heritage` to the statement text itself. + // + // Downstream error reporting/type inference expects `expression_heritage` to be + // rooted in the rule text (statement), not the entire program. + SpanString statement{st.str()}; + std::optional rule; - if (auto ann = ParseFunctionRuleImpl(st)) { + if (auto ann = ParseFunctionRuleImpl(statement)) { rules.push_back(ann->first); rule = ann->second; } if (!rule) { - rule = ParseFunctorRule(st); + rule = ParseFunctorRule(statement); } if (!rule) { - Json r = ParseRule(st); + Json r = ParseRule(statement); if (!r.is_null()) { auto anns = AnnotationsFromDenotations(r); for (const auto& a : anns) rules.push_back(a); @@ -2325,12 +2560,90 @@ int logica_cpp_parse_rules_json(const char* program_text, logica::parser::Json parsed = logica::parser::ParseFile(content, fname, import_root); std::string out; if (full) { - out = parsed.ToString(true, 1); + // C ABI is consumed by Python and immediately parsed; compact JSON is + // significantly faster to serialize and decode than pretty output. + out = parsed.ToString(false, 1); } else { const auto& obj = parsed.as_object(); auto it = obj.find("rule"); - out = (it == obj.end()) ? std::string("[]") : it->second.ToString(true, 1); + out = (it == obj.end()) ? std::string("[]") : it->second.ToString(false, 1); + } + if (out_json) { + *out_json = DupToMalloc(out); } + return 0; + } catch (const logica::parser::ParsingException& e) { + std::ostringstream oss; + e.ShowMessage(oss); + if (out_err) { + *out_err = DupToMalloc(oss.str()); + } + return 1; + } catch (const std::exception& e) { + std::string msg = std::string("Error: ") + e.what() + "\n"; + if (out_err) { + *out_err = DupToMalloc(msg); + } + return 2; + } +} + +// Same as logica_cpp_parse_rules_json, but emits pooled heritage spans. +// +// Output JSON shape: +// {"__string_table": [], "tree": } +// +// In the returned tree, `full_text` and `expression_heritage` are encoded as: +// ["__hs", , , ] +int logica_cpp_parse_rules_json_pooled(const char* program_text, + const char* file_name, + const char* logicapath, + int full, + void** out_json, + void** out_err) { + if (out_json) *out_json = nullptr; + if (out_err) *out_err = nullptr; + + logica::parser::HeritagePool pool; + struct Reset { + ~Reset() { + logica::parser::g_emit_pooled_heritage = false; + logica::parser::g_heritage_pool = nullptr; + } + } reset; + logica::parser::g_emit_pooled_heritage = true; + logica::parser::g_heritage_pool = &pool; + + try { + const std::string content = program_text ? std::string(program_text) : std::string(); + const std::string fname = file_name ? std::string(file_name) : std::string("main"); + std::vector import_root = SplitLogicapath(logicapath); + + logica::parser::Json parsed = logica::parser::ParseFile(content, fname, import_root); + + // Prepare string table. + logica::parser::JsonArray table; + table.reserve(pool.heritage.size()); + for (const auto& e : pool.heritage) { + table.push_back(logica::parser::Json(e.bytes ? *(e.bytes) : std::string())); + } + + logica::parser::Json tree; + if (full) { + tree = parsed; + } else { + const auto& obj = parsed.as_object(); + auto it = obj.find("rule"); + tree = (it == obj.end()) ? logica::parser::Json(logica::parser::JsonArray{}) : it->second; + } + + logica::parser::JsonObject wrapped; + wrapped["__string_table"] = logica::parser::Json(table); + wrapped["tree"] = tree; + + // Compact JSON for performance; Python re-dumps in canonical form when + // needed for diffs. + std::string out = logica::parser::Json(wrapped).ToString(false, 1); if (out_json) { *out_json = DupToMalloc(out); } diff --git a/parser_cpp/logica_parse_cpp.py b/parser_cpp/logica_parse_cpp.py index 0b615f11..85f4a9ac 100644 --- a/parser_cpp/logica_parse_cpp.py +++ b/parser_cpp/logica_parse_cpp.py @@ -40,54 +40,262 @@ _LIB: Optional[ctypes.CDLL] = None +# NOTE: This bridge always prefers the pooled-heritage C++ ABI when present. +# The previous env toggles were removed to prevent accidental benchmarking or +# correctness testing of the wrong mode. -_KEY_ORDER_PRIORITY = { - # High-level rule structure. - 'head': 0, - 'body': 1, - 'full_text': 99, - # Common predicate/call shapes. - 'predicate_name': 0, - 'record': 1, - 'field_value': 0, - 'field': 0, - 'value': 1, +def _WrapHeritageAwareStrings(node, heritage_root=None, _match_state=None): + """Rehydrates heritage-aware strings for parity with the Python parser. - # Expressions. - 'expression': 0, - 'literal': 1, - 'variable': 2, - 'predicate': 3, + The Python parser uses `parser_py.parse.HeritageAwareString` for fields like + `expression_heritage` (and `full_text`) so error reporting/type inference can + call `.Display()` and highlight the relevant span within the *full* statement. - # Bodies. - 'conjunction': 0, - 'conjunct': 1, -} + The C++ parser serializes these fields as plain JSON strings. We can't always + reconstruct exact spans (ambiguity, whitespace normalization, repeated + substrings), but we can do a best-effort rehydration: + - Always wrap `full_text` into `HeritageAwareString`. + - Wrap `expression_heritage` into `HeritageAwareString`. + - If a surrounding `full_text` exists, try to locate `expression_heritage` + within it and set `.start/.stop/.heritage` accordingly. -def _NormalizeKeyOrder(node): - """Recursively normalizes dict insertion order for downstream determinism. - - The C++ parser produces JSON objects backed by `std::map`, so keys are - serialized in lexicographic order. The Python parser creates dicts with - a more semantic insertion order (e.g. head, body, full_text). - - Some downstream code (notably type inference) walks dicts in insertion order, - which can affect outcomes like assigned `type_id`s. - - This function rebuilds dictionaries to approximate the Python parser's key - order while preserving values exactly. + This keeps downstream code working and typically restores meaningful context + in `.Display()`. """ + if _match_state is None: + # Tracks matching progress for repeated substrings within the same + # `full_text` heritage. Keyed by (root_id, substring). + _match_state = {} + if isinstance(node, list): - return [_NormalizeKeyOrder(x) for x in node] + return [_WrapHeritageAwareStrings(x, heritage_root, _match_state) for x in node] + if isinstance(node, dict): - items = list(node.items()) - items.sort(key=lambda kv: (_KEY_ORDER_PRIORITY.get(kv[0], 50), kv[0])) - return {k: _NormalizeKeyOrder(v) for k, v in items} + parse_mod = _GetParseModule() + HeritageAwareString = getattr(parse_mod, 'HeritageAwareString', str) + + def _needs_alignment(x) -> bool: + if not isinstance(x, HeritageAwareString): + return True + try: + text = str(x) + return ( + getattr(x, 'heritage', text) == text and + getattr(x, 'start', 0) == 0 and + getattr(x, 'stop', len(text)) == len(text) + ) + except Exception: # pylint: disable=broad-exception-caught + return True + + local_root = heritage_root + if 'full_text' in node and isinstance(node.get('full_text'), str): + full_text_value = node.get('full_text') + if not isinstance(full_text_value, HeritageAwareString): + full_text_value = HeritageAwareString(full_text_value) + local_root = full_text_value + + result = {} + for key, value in node.items(): + if key == 'full_text' and local_root is not None: + result[key] = local_root + continue + + if key == 'expression_heritage' and isinstance(value, str): + # If expression_heritage is exactly the same text as the surrounding + # full_text, reuse the same HeritageAwareString instance. This avoids + # retaining multiple copies of large rule texts that JSON decoding would + # otherwise duplicate. + if local_root is not None and str(value) == str(local_root): + result[key] = local_root + continue + + expr_value = value + if not isinstance(expr_value, HeritageAwareString): + expr_value = HeritageAwareString(expr_value) + + if local_root is not None and _needs_alignment(expr_value): + heritage_text = getattr(local_root, 'heritage', str(local_root)) + substring = str(expr_value) + root_id = id(local_root) + start_from = _match_state.get((root_id, substring), 0) + idx = str(heritage_text).find(substring, start_from) + if idx == -1 and start_from: + # Fallback if traversal order doesn't align with textual order. + idx = str(heritage_text).find(substring) + if idx != -1: + expr_value.heritage = heritage_text + expr_value.start = idx + expr_value.stop = idx + len(substring) + _match_state[(root_id, substring)] = idx + len(substring) + + result[key] = expr_value + continue + + result[key] = _WrapHeritageAwareStrings(value, local_root, _match_state) + return result + return node +def _DecodePooledHeritageOutput(node): + """Decodes pooled-heritage JSON output from the C++ parser. + + When available, the C++ shared library can emit a wrapper JSON object: + {"__string_table": [...], "tree": ...} + + In the tree, spans are represented as: + ["__hs", , , ] + + (Legacy format is also accepted for forward/backward compatibility: + {"__hs": , "start": , "stop": }) + + This function reconstructs `parser_py.parse.HeritageAwareString` objects so + downstream code sees the same types as the Python parser. + """ + if not (isinstance(node, dict) and '__string_table' in node and 'tree' in node): + return node + + string_table = node.get('__string_table') + tree = node.get('tree') + if not isinstance(string_table, list): + return tree + + parse_mod = _GetParseModule() + HeritageAwareString = getattr(parse_mod, 'HeritageAwareString', str) + + table_len = len(string_table) + + # Cache: idx -> bool (isascii). Built lazily. + ascii_flags = [None] * table_len + + # Cache: idx -> (utf8_byte_len, byte_offset->char_offset mapping) + # Built lazily only for non-ascii strings. + byte_to_char_map = [None] * table_len + + # Cache: (idx, start_off, stop_off) -> HeritageAwareString + span_cache = {} + + def is_ascii(idx: int, heritage: str) -> bool: + v = ascii_flags[idx] + if v is None: + v = heritage.isascii() + ascii_flags[idx] = v + return bool(v) + + def get_map(idx: int, heritage: str): + cached = byte_to_char_map[idx] + if cached is not None: + return cached + b = heritage.encode('utf-8') + mapping = [0] * (len(b) + 1) + chars = 0 + for i, byt in enumerate(b): + # UTF-8 continuation bytes are 10xxxxxx. + if (byt & 0xC0) != 0x80: + chars += 1 + mapping[i + 1] = chars + cached = (len(b), mapping) + byte_to_char_map[idx] = cached + return cached + + def decode_span(idx: int, start_b: int, stop_b: int): + if idx < 0 or idx >= table_len: + return None + heritage = string_table[idx] + if not isinstance(heritage, str): + return None + if start_b < 0: + start_b = 0 + if stop_b < start_b: + stop_b = start_b + + cache_key = (idx, start_b, stop_b) + cached = span_cache.get(cache_key) + if cached is not None: + return cached + + if is_ascii(idx, heritage): + start_c = min(start_b, len(heritage)) + stop_c = min(stop_b, len(heritage)) + else: + blen, mapping = get_map(idx, heritage) + if start_b > blen: + start_b = blen + if stop_b > blen: + stop_b = blen + start_c = mapping[start_b] + stop_c = mapping[stop_b] + + text = heritage[start_c:stop_c] + hs = HeritageAwareString(text) + try: + hs.heritage = heritage + hs.start = start_c + hs.stop = stop_c + except Exception: # pylint: disable=broad-exception-caught + pass + span_cache[cache_key] = hs + return hs + + def decode_known_key_span(value): + # New compact encoding: [idx, start_b, stop_b]. + if isinstance(value, list) and len(value) == 3: + idx, start_b, stop_b = value + if isinstance(idx, int) and isinstance(start_b, int) and isinstance(stop_b, int): + hs = decode_span(idx, start_b, stop_b) + if hs is not None: + return hs + # Legacy v1 encoding: ["__hs", idx, start_b, stop_b]. + if isinstance(value, list) and len(value) == 4 and value and value[0] == '__hs': + idx = value[1] + start_b = value[2] + stop_b = value[3] + if isinstance(idx, int) and isinstance(start_b, int) and isinstance(stop_b, int): + hs = decode_span(idx, start_b, stop_b) + if hs is not None: + return hs + # Legacy dict encoding: {"__hs": idx, "start": b, "stop": b} + if isinstance(value, dict) and len(value) == 3 and '__hs' in value and 'start' in value and 'stop' in value: + idx = value.get('__hs') + start_b = value.get('start') + stop_b = value.get('stop') + if isinstance(idx, int) and isinstance(start_b, int) and isinstance(stop_b, int): + hs = decode_span(idx, start_b, stop_b) + if hs is not None: + return hs + return value + + def decode(x): + # Mutate containers in-place to avoid allocating a fresh list/dict for every + # node; the JSON tree returned by json.loads is not shared. + if isinstance(x, list): + for i, v in enumerate(x): + if isinstance(v, (list, dict)): + x[i] = decode(v) + return x + + if isinstance(x, dict): + # Decode spans only under known keys to avoid misinterpreting ordinary + # numeric lists elsewhere in the AST. + if 'full_text' in x: + x['full_text'] = decode_known_key_span(x.get('full_text')) + if 'expression_heritage' in x: + x['expression_heritage'] = decode_known_key_span(x.get('expression_heritage')) + + for k, v in x.items(): + if k in ('full_text', 'expression_heritage'): + continue + if isinstance(v, (list, dict)): + x[k] = decode(v) + return x + + return x + + return decode(tree) + + def _GetParseModule(): global _PARSE_MOD if _PARSE_MOD is not None: @@ -260,6 +468,18 @@ def LoadCppParserLib(repo_root: Optional[str] = None) -> ctypes.CDLL: ] lib.logica_cpp_parse_rules_json.restype = ctypes.c_int + pooled = getattr(lib, 'logica_cpp_parse_rules_json_pooled', None) + if pooled is not None: + pooled.argtypes = [ + ctypes.c_char_p, # program_text + ctypes.c_char_p, # file_name + ctypes.c_char_p, # logicapath (colon-separated) + ctypes.c_int, # full + ctypes.POINTER(ctypes.c_void_p), + ctypes.POINTER(ctypes.c_void_p), + ] + pooled.restype = ctypes.c_int + lib.logica_cpp_free.argtypes = [ctypes.c_void_p] lib.logica_cpp_free.restype = None @@ -276,7 +496,16 @@ def ParseRulesJsonNative(program_text: str, out_ptr = ctypes.c_void_p() err_ptr = ctypes.c_void_p() - rc = lib.logica_cpp_parse_rules_json( + # Pooled heritage ABI is the only supported mode for the C++ parser bridge. + # If the symbol is missing, we likely loaded a stale/older shared library. + fn = getattr(lib, 'logica_cpp_parse_rules_json_pooled', None) + if fn is None: + raise RuntimeError( + 'C++ parser shared library does not export logica_cpp_parse_rules_json_pooled. ' + 'This likely means a stale/older liblogica_parse_cpp.so is being used. ' + 'Try deleting the cache dir and re-running to rebuild.' + ) + rc = fn( program_text.encode('utf-8'), file_name.encode('utf-8'), logicapath.encode('utf-8') if logicapath else None, @@ -314,7 +543,12 @@ def ParseRules(program_text: str, if rc != 0: raise _CppParsingExceptionClass(exception_thrower)(err) try: - return _NormalizeKeyOrder(json.loads(out)) + loaded = json.loads(out) + pooled = isinstance(loaded, dict) and '__string_table' in loaded and 'tree' in loaded + loaded = _DecodePooledHeritageOutput(loaded) + if pooled: + return loaded + return _WrapHeritageAwareStrings(loaded) except Exception as e: raise RuntimeError('Failed to json-parse C++ parser output: %s' % e) from e @@ -349,6 +583,11 @@ def ParseFile(program_text: str, if rc != 0: raise _CppParsingExceptionClass(exception_thrower)(err) try: - return _NormalizeKeyOrder(json.loads(out)) + loaded = json.loads(out) + pooled = isinstance(loaded, dict) and '__string_table' in loaded and 'tree' in loaded + loaded = _DecodePooledHeritageOutput(loaded) + if pooled: + return loaded + return _WrapHeritageAwareStrings(loaded) except Exception as e: raise RuntimeError('Failed to json-parse C++ parser output: %s' % e) from e diff --git a/tools/run_heritage_error_demo.py b/tools/run_heritage_error_demo.py new file mode 100644 index 00000000..ee96de4f --- /dev/null +++ b/tools/run_heritage_error_demo.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +import os +import sys + +# Ensure the repo root is importable when running as `python tools/...`. +_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _REPO_ROOT not in sys.path: + sys.path.insert(0, _REPO_ROOT) + +from parser_py import parse +from type_inference.research import infer + + +def main(argv: list[str]) -> int: + demo_path = ( + argv[1] + if len(argv) > 1 + else 'type_inference/research/integration_tests/typing_heritage_error_demo.l' + ) + + # Force the C++ parser path so this demo validates the C++->Python bridge. + os.environ['LOGICA_PARSER'] = 'CPP' + + program_text = open(demo_path, 'r', encoding='utf-8').read() + parsed_rules = parse.ParseFile(program_text)['rule'] + + engine = infer.TypesInferenceEngine(parsed_rules, dialect='psql') + engine.InferTypes() + + # Raises TypeErrorCaughtException with a message that includes + # `expression_heritage.Display()`. This relies on correct heritage propagation. + infer.TypeErrorChecker(parsed_rules).CheckForError(mode='raise') + + return 0 + + +if __name__ == '__main__': + raise SystemExit(main(sys.argv)) diff --git a/type_inference/research/integration_tests/typing_heritage_error_demo.l b/type_inference/research/integration_tests/typing_heritage_error_demo.l new file mode 100644 index 00000000..a8e597be --- /dev/null +++ b/type_inference/research/integration_tests/typing_heritage_error_demo.l @@ -0,0 +1,12 @@ +@Engine("psql"); + +# This program is intentionally ill-typed. +# It triggers a type inference error that calls `expression_heritage.Display()`. +# If `expression_heritage` is not a HeritageAwareString (or span alignment is wrong), +# the resulting error loses context or crashes. +Demo(x: 1) :- + # Include a non-ASCII character before the error site, to exercise + # UTF-8 byte→char span decoding. + y == "π", + # Builtin Length expects positional argument 0, not a named field `str:`. + Length(str: "abc");