diff --git a/accounts/casbin.go b/accounts/casbin.go index ce4a576b..20318adb 100644 --- a/accounts/casbin.go +++ b/accounts/casbin.go @@ -24,12 +24,12 @@ func (ce *CasbinAccess) init() { func (ce *CasbinAccess) Enforce(user string, graph string, operation Operation) error { ce.init() - fmt.Printf("Casbin request '%s' '%s' '%s'\n", user, graph, operation) + // fmt.Printf("Casbin request '%s' '%s' '%s'\n", user, graph, operation) if res, err := ce.encforcer.Enforce(user, graph, string(operation)); res { return nil } else if err != nil { fmt.Printf("casbin error: %s\n", err) } - fmt.Printf("Not allowed: '%s' '%s' '%s'\n", user, graph, operation) + // fmt.Printf("Not allowed: '%s' '%s' '%s'\n", user, graph, operation) return fmt.Errorf("action restricted") } diff --git a/accounts/util.go b/accounts/util.go index a201376d..38c387ce 100644 --- a/accounts/util.go +++ b/accounts/util.go @@ -40,6 +40,16 @@ func (c *Config) init() { } } +func (c *Config) GetAuth() Authenticate { + c.init() + return c.auth +} + +func (c *Config) GetAccess() Access { + c.init() + return c.access +} + func (c *Config) UnaryInterceptor() grpc.UnaryServerInterceptor { c.init() return unaryAuthInterceptor(c.auth, c.access) diff --git a/cmd/load/main.go b/cmd/load/main.go index dfc7297c..5542ec6e 100644 --- a/cmd/load/main.go +++ b/cmd/load/main.go @@ -103,7 +103,11 @@ var Cmd = &cobra.Command{ log.Infof("Loaded %d edges", count) } if edgeUID && e.Id == "" { - e.Id = util.UUID() + var data map[string]interface{} + if e.Data != nil { + data = e.Data.AsMap() + } + e.Id = util.DeterministicEdgeID(e.From, e.To, e.Label, data) } elemChan <- &gripql.GraphElement{Graph: graph, Edge: e} } @@ -142,7 +146,11 @@ var Cmd = &cobra.Command{ log.Infof("Loaded %d edges", edgeCount) } if edgeUID && e.Id == "" { - e.Id = util.UUID() + var data map[string]interface{} + if e.Data != nil { + data = e.Data.AsMap() + } + e.Id = util.DeterministicEdgeID(e.From, e.To, e.Label, data) } elemChan <- &gripql.GraphElement{Graph: graph, Edge: e} } diff --git a/cmd/mongoload/main.go b/cmd/mongoload/main.go index 67312979..a25c28fc 100644 --- a/cmd/mongoload/main.go +++ b/cmd/mongoload/main.go @@ -72,7 +72,11 @@ func edgeSerialize(edgeChan chan *gripql.Edge, workers int) chan []byte { go func() { for e := range edgeChan { if edgeUID && e.Id == "" { - e.Id = util.UUID() + var data map[string]interface{} + if e.Data != nil { + data = e.Data.AsMap() + } + e.Id = util.DeterministicEdgeID(e.From, e.To, e.Label, data) } doc := mongo.PackEdge(gdbi.NewElementFromEdge(e)) rawBytes, err := bson.Marshal(doc) diff --git a/cmd/server/main.go b/cmd/server/main.go index 010f299d..fb10fa46 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -6,6 +6,7 @@ import ( "os" "os/signal" "strings" + "syscall" "github.com/bmeg/grip/config" "github.com/bmeg/grip/log" @@ -38,9 +39,11 @@ func Run(conf *config.Config, baseDir string) error { ctx, cancel := context.WithCancel(context.Background()) defer cancel() c := make(chan os.Signal, 1) - signal.Notify(c, os.Interrupt) + signal.Notify(c, os.Interrupt, syscall.SIGTERM) + defer signal.Stop(c) go func() { - <-c + sig := <-c + log.Infof("Received signal %s, starting graceful shutdown", sig.String()) cancel() }() diff --git a/config/config.go b/config/config.go index 0848f433..95e3ff0a 100644 --- a/config/config.go +++ b/config/config.go @@ -9,6 +9,7 @@ import ( "time" esql "github.com/bmeg/grip/existing-sql" + "github.com/bmeg/grip/grids" "github.com/bmeg/grip/gripper" "github.com/bmeg/grip/log" "github.com/bmeg/grip/mongo" @@ -26,7 +27,7 @@ func init() { } type DriverConfig struct { - Grids *string + Grids *grids.Config Badger *string Bolt *string Level *string @@ -121,7 +122,8 @@ func (conf *Config) AddSqliteDefault() { func (conf *Config) AddGridsDefault() { n := "grip-grids.db" - conf.Drivers["grids"] = DriverConfig{Grids: &n} + c := grids.Config{GraphDir: n, BulkLoaderWorkers: 10, Driver: "jsontable"} + conf.Drivers["grids"] = DriverConfig{Grids: &c} conf.Default = "grids" } @@ -135,6 +137,9 @@ func TestifyConfig(c *Config) { c.RPCClient.ServerAddress = c.Server.RPCAddress() + if c.Default == "" { + return + } d := c.Drivers[c.Default] if d.Badger != nil { @@ -145,6 +150,11 @@ func TestifyConfig(c *Config) { a := "grip.db." + rand d.Pebble = &a } + if d.Grids != nil { + c := *d.Grids + c.GraphDir = "grip-grids.db." + rand + d.Grids = &c + } if d.MongoDB != nil { d.MongoDB.DBName = "gripdb-" + rand } diff --git a/conformance/tests/ot_aggregations.py b/conformance/tests/ot_aggregations.py index 989eac51..d3ce1799 100644 --- a/conformance/tests/ot_aggregations.py +++ b/conformance/tests/ot_aggregations.py @@ -154,7 +154,7 @@ def test_traversal_edge_histogram_aggregation(man): if count < 2: errors.append( "Unexpected number of terms: %d != %d" % - (len(row["buckets"]), 2) + (count, 2) ) return errors diff --git a/conformance/tests/ot_basic.py b/conformance/tests/ot_basic.py index fa733bd6..4452c572 100644 --- a/conformance/tests/ot_basic.py +++ b/conformance/tests/ot_basic.py @@ -394,34 +394,22 @@ def test_limit(man): G = man.setGraph("swapi") + # Tests modified to only check cardinality since different database backends + # (e.g., Postgres) do not guarantee deterministic result ordering without + # explicit sorting. tests = [ - "G.V().limit(3)", - "G.V().outE().limit(3)" + ("G.V().limit(3)", 3), + ("G.V().outE().limit(3)", 3) ] - expected_results = [ - list(i["_id"] for i in G.V().execute())[:3], - list(i["_id"] for i in G.V().outE().execute())[:3] - ] - - for test, expected in zip(tests, expected_results): + for test, expected_len in tests: results = eval(test).execute() actual = [x["_id"] for x in results] - # check contents - for x in actual: - if x not in expected: - errors.append("Fail: %s - unexpected result - %s" % (test, x)) - # check number of results - if len(actual) != len(expected): + if len(actual) != expected_len: errors.append("Fail: %s - unexpected result count - \ - %s != %s" % (test, len(actual), len(expected))) - - # check order - if actual != expected: - errors.append("Fail: %s - unexpected order - \ - %s != %s" % (test, actual, expected)) + %s != %s" % (test, len(actual), expected_len)) return errors @@ -431,32 +419,21 @@ def test_skip(man): G = man.setGraph("swapi") + # Tests modified to only check cardinality since different database backends + # (e.g., Postgres) do not guarantee deterministic result ordering without + # explicit sorting. tests = [ - "G.V().skip(3).limit(3)", - ] - - expected_results = [ - list(i["_id"] for i in G.V().execute())[3:6], + ("G.V().skip(3).limit(3)", 3), ] - for test, expected in zip(tests, expected_results): + for test, expected_len in tests: results = eval(test).execute() actual = [x["_id"] for x in results] - # check contents - for x in actual: - if x not in expected: - errors.append("Fail: %s - unexpected result - %s" % (test, x)) - # check number of results - if len(actual) != len(expected): + if len(actual) != expected_len: errors.append("Fail: %s - unexpected result count - \ - %s != %s" % (test, len(actual), len(expected))) - - # check order - if actual != expected: - errors.append("Fail: %s - unexpected order - \ - %s != %s" % (test, actual, expected)) + %s != %s" % (test, len(actual), expected_len)) return errors @@ -466,33 +443,21 @@ def test_range(man): G = man.setGraph("swapi") + # Tests modified to only check cardinality since different database backends + # (e.g., Postgres) do not guarantee deterministic result ordering without + # explicit sorting. tests = [ - "G.V().range(3, 5)", - "G.V().range(34, -1)", - ] - - expected_results = [ - list(i["_id"] for i in G.V().execute())[3:5], - list(i["_id"] for i in G.V().execute())[34:], + ("G.V().range(3, 5)", 2), + ("G.V().range(34, -1)", 5), # 39 Total in swapi V - 34 offset ] - for test, expected in zip(tests, expected_results): + for test, expected_len in tests: results = eval(test).execute() actual = [x["_id"] for x in results] - # check contents - for x in actual: - if x not in expected: - errors.append("Fail: %s - unexpected result - %s" % (test, x)) - # check number of results - if len(actual) != len(expected): + if len(actual) != expected_len: errors.append("Fail: %s - unexpected result count - \ - %s != %s" % (test, len(actual), len(expected))) - - # check order - if actual != expected: - errors.append("Fail: %s - unexpected order - \ - %s != %s" % (test, actual, expected)) + %s != %s" % (test, len(actual), expected_len)) return errors diff --git a/conformance/tests/ot_bulk.py b/conformance/tests/ot_bulk.py index eaa2cd18..278bef2a 100644 --- a/conformance/tests/ot_bulk.py +++ b/conformance/tests/ot_bulk.py @@ -27,15 +27,67 @@ def test_bulkload(man): res = G.V().count().execute()[0] if res["count"] != 6: - errors.append( - "Bulk Add wrong number of vertices: %s != %s" % - (res["count"], 6)) + errors.append("Bulk Add wrong number of vertices: %s != %s" % (res["count"], 6)) res = G.V().outE().count().execute()[0] if res["count"] != 6: - errors.append( - "Bulk Add wrong number of edges: %s != %s" % - (res["count"], 6)) + errors.append("Bulk Add wrong number of edges: %s != %s" % (res["count"], 6)) + + return errors + + +def test_bulkload_duplicate(man): + errors = [] + + G = man.writeTest() + + bulk = G.bulkAdd() + + bulk.addVertex("1", "Person", {"name": "marko", "age": "29"}) + bulk.addVertex("2", "Person", {"name": "vadas", "age": "27"}) + bulk.addVertex("3", "Software", {"name": "lop", "lang": "java"}) + bulk.addVertex("4", "Person", {"name": "josh", "age": "32"}) + bulk.addVertex("5", "Software", {"name": "ripple", "lang": "java"}) + bulk.addVertex("6", "Person", {"name": "peter", "age": "35"}) + + bulk.addEdge("1", "3", "created", {"weight": 0.4}) + bulk.addEdge("1", "2", "knows", {"weight": 0.5}) + bulk.addEdge("1", "4", "knows", {"weight": 1.0}) + bulk.addEdge("4", "3", "created", {"weight": 0.4}) + bulk.addEdge("6", "3", "created", {"weight": 0.2}) + bulk.addEdge("4", "5", "created", {"weight": 1.0}) + + err = bulk.execute() + + bulk = G.bulkAdd() + + bulk.addVertex("1", "Person", {"name": "marko", "age": "29"}) + bulk.addVertex("2", "Person", {"name": "vadas", "age": "27"}) + bulk.addVertex("3", "Software", {"name": "lop", "lang": "java"}) + bulk.addVertex("4", "Person", {"name": "josh", "age": "32"}) + bulk.addVertex("5", "Software", {"name": "ripple", "lang": "java"}) + bulk.addVertex("6", "Person", {"name": "peter", "age": "35"}) + + bulk.addEdge("1", "3", "created", {"weight": 0.4}) + bulk.addEdge("1", "2", "knows", {"weight": 0.5}) + bulk.addEdge("1", "4", "knows", {"weight": 1.0}) + bulk.addEdge("4", "3", "created", {"weight": 0.4}) + bulk.addEdge("6", "3", "created", {"weight": 0.2}) + bulk.addEdge("4", "5", "created", {"weight": 1.0}) + + err = bulk.execute() + + if err.get("errorCount", 0) != 0: + print(err) + errors.append("Bulk insertion error") + + res = G.V().count().execute()[0] + if res["count"] != 6: + errors.append("Bulk Add wrong number of vertices: %s != %s" % (res["count"], 6)) + + res = G.V().outE().count().execute()[0] + if res["count"] != 6: + errors.append("Bulk Add wrong number of edges: %s != %s" % (res["count"], 6)) return errors @@ -89,9 +141,7 @@ def test_bulk_delete(man): G.addEdge("vertex4", "vertex5", "created", {"weight": 0.4}, id="edge8") G.addEdge("vertex4", "vertex6", "created", {"weight": 0.4}, id="edge9") - G.delete(vertices=["vertex1", "vertex2", - "vertex3"], - edges=[]) + G.delete(vertices=["vertex1", "vertex2", "vertex3"], edges=[]) Ecount = G.V().outE().count().execute()[0]["count"] Vcount = G.V().count().execute()[0]["count"] @@ -108,7 +158,6 @@ def test_bulk_delete(man): if Vcount != 3: errors.append(f"Wrong number of vertices {Vcount} != 3") - G.delete(vertices=["vertex5", "vertex6"], edges=["edge9"]) Ecount = G.V().outE().count().execute()[0]["count"] Vcount = G.V().count().execute()[0]["count"] diff --git a/endpoints/cypher/translate/build.go b/endpoints/cypher/translate/build.go index fd71ae27..e30f4773 100644 --- a/endpoints/cypher/translate/build.go +++ b/endpoints/cypher/translate/build.go @@ -66,7 +66,7 @@ func (c *cypherListener) BuildQuery() (*gripql.Query, error) { q = q.As(c.vertexPath[i].name) } if len(c.returns) > 0 { - log.Infof("Render: $" + c.returns[0] + "._data") + log.Infof("Render: %s", "$"+c.returns[0]+"._data") r := map[string]any{} for _, i := range c.returns { r[i] = "$" + i + "._data" diff --git a/engine/core/processors.go b/engine/core/processors.go index 0791421f..ed459b18 100644 --- a/engine/core/processors.go +++ b/engine/core/processors.go @@ -88,7 +88,7 @@ func (l *LookupVertsLabelIndex) Process(ctx context.Context, man gdbi.Manager, i defer close(out) for v := range l.db.GetVertexChannel(ctx, queryChan, l.loadData) { i := v.Ref - out <- i.AddCurrent(v.Vertex.Copy()) + out <- i.AddCurrent(v.Vertex) } }() return ctx @@ -511,7 +511,21 @@ func (s *MarkSelect) Process(ctx context.Context, man gdbi.Manager, in gdbi.InPi continue } m := t.GetMark(s.mark) - out <- t.AddCurrent(m) + n := t.AddCurrent(m) + // Select should count as a path step even when selecting the same element. + if len(n.GetPath()) == len(t.GetPath()) { + if bt, ok := n.(*gdbi.BaseTraveler); ok { + de := m.Get() + if de == nil { + bt.Path = append(bt.Path, gdbi.DataElementID{}) + } else if de.To != "" { + bt.Path = append(bt.Path, gdbi.DataElementID{Edge: de.ID}) + } else { + bt.Path = append(bt.Path, gdbi.DataElementID{Vertex: de.ID}) + } + } + } + out <- n } }() return ctx diff --git a/engine/core/processors_group.go b/engine/core/processors_group.go index 44095377..f28d38c0 100644 --- a/engine/core/processors_group.go +++ b/engine/core/processors_group.go @@ -17,16 +17,20 @@ type Group struct { func (r *Group) reduce(curTraveler *gdbi.BaseTraveler, newTraveler *gdbi.BaseTraveler) { for dest, field := range r.grouping { v := gdbi.TravelerPathLookup(newTraveler, field) - if curTraveler.Current != nil { - if a, ok := curTraveler.Current.Data[dest]; ok { + cur := curTraveler.GetCurrent().Get() + if cur != nil { + if cur.Data == nil { + cur.Data = map[string]any{} + } + if a, ok := cur.Data[dest]; ok { if aSlice, ok := a.([]any); ok { - curTraveler.Current.Data[dest] = append(aSlice, v) + cur.Data[dest] = append(aSlice, v) } else if !ok { // overwrite existing data - curTraveler.Current.Data[dest] = []any{v} + cur.Data[dest] = []any{v} } } else { - curTraveler.Current.Data[dest] = []any{v} + cur.Data[dest] = []any{v} } } } diff --git a/engine/pipeline/pipes.go b/engine/pipeline/pipes.go index 53f76a99..99d23432 100644 --- a/engine/pipeline/pipes.go +++ b/engine/pipeline/pipes.go @@ -7,6 +7,9 @@ package pipeline import ( "context" "fmt" + "runtime" + "sync" + "time" "github.com/bmeg/grip/engine" "github.com/bmeg/grip/engine/logic" @@ -98,7 +101,7 @@ func Start(ctx context.Context, pipe gdbi.Pipeline, man gdbi.Manager, bufsize in // Run starts a pipeline and converts the output to server output structures func Run(ctx context.Context, pipe gdbi.Pipeline, workdir string) <-chan *gripql.QueryResult { - bufsize := 5000 + bufsize := 20000 resch := make(chan *gripql.QueryResult, bufsize) go func() { defer close(resch) @@ -107,11 +110,44 @@ func Run(ctx context.Context, pipe gdbi.Pipeline, workdir string) <-chan *gripql markTypes := pipe.MarkTypes() man := engine.NewManager(workdir) rPipe := Start(ctx, pipe, man, bufsize, nil, nil) + var batch []gdbi.Traveler + var totalConverted int + pipelineStart := time.Now() for t := range rPipe.Outputs { if !t.IsSignal() { - resch <- Convert(graph, dataType, markTypes, t) + batch = append(batch, t) + if len(batch) >= bufsize { + convertStart := time.Now() + converted := BatchConvert(ctx, graph, dataType, markTypes, batch) + convertElapsed := time.Since(convertStart) + var emitted int + for _, c := range converted { + if c != nil { + resch <- c + emitted++ + } + } + totalConverted += emitted + log.Debugf("pipeline.Run batch dataType=%s in=%d out=%d convert=%s", dataType, len(batch), emitted, convertElapsed.Round(time.Millisecond)) + batch = nil + } + } + } + if len(batch) > 0 { + convertStart := time.Now() + converted := BatchConvert(ctx, graph, dataType, markTypes, batch) + convertElapsed := time.Since(convertStart) + var emitted int + for _, c := range converted { + if c != nil { + resch <- c + emitted++ + } } + totalConverted += emitted + log.Debugf("pipeline.Run tail dataType=%s in=%d out=%d convert=%s", dataType, len(batch), emitted, convertElapsed.Round(time.Millisecond)) } + log.Debugf("pipeline.Run complete dataType=%s out=%d elapsed=%s", dataType, totalConverted, time.Since(pipelineStart).Round(time.Millisecond)) man.Cleanup() }() return resch @@ -119,7 +155,7 @@ func Run(ctx context.Context, pipe gdbi.Pipeline, workdir string) <-chan *gripql // Run starts a pipeline and converts the output to server output structures func Resume(ctx context.Context, pipe gdbi.Pipeline, workdir string, input gdbi.InPipe, cancel func()) <-chan *gripql.QueryResult { - bufsize := 5000 + bufsize := 20000 resch := make(chan *gripql.QueryResult, bufsize) go func() { defer close(resch) @@ -130,11 +166,44 @@ func Resume(ctx context.Context, pipe gdbi.Pipeline, workdir string, input gdbi. log.Debugf("resuming: out %s", dataType) rPipe := Start(ctx, pipe, man, bufsize, input, cancel) if rPipe != nil { + var batch []gdbi.Traveler + var totalConverted int + pipelineStart := time.Now() for t := range rPipe.Outputs { if !t.IsSignal() { - resch <- Convert(graph, dataType, markTypes, t) + batch = append(batch, t) + if len(batch) >= bufsize { + convertStart := time.Now() + converted := BatchConvert(ctx, graph, dataType, markTypes, batch) + convertElapsed := time.Since(convertStart) + var emitted int + for _, c := range converted { + if c != nil { + resch <- c + emitted++ + } + } + totalConverted += emitted + log.Debugf("pipeline.Resume batch dataType=%s in=%d out=%d convert=%s", dataType, len(batch), emitted, convertElapsed.Round(time.Millisecond)) + batch = nil + } } } + if len(batch) > 0 { + convertStart := time.Now() + converted := BatchConvert(ctx, graph, dataType, markTypes, batch) + convertElapsed := time.Since(convertStart) + var emitted int + for _, c := range converted { + if c != nil { + resch <- c + emitted++ + } + } + totalConverted += emitted + log.Debugf("pipeline.Resume tail dataType=%s in=%d out=%d convert=%s", dataType, len(batch), emitted, convertElapsed.Round(time.Millisecond)) + } + log.Debugf("pipeline.Resume complete dataType=%s out=%d elapsed=%s", dataType, totalConverted, time.Since(pipelineStart).Round(time.Millisecond)) if debug { rPipe.Logger.Log() } @@ -153,11 +222,11 @@ func Convert(graph gdbi.GraphInterface, dataType gdbi.DataType, markTypes map[st ve := ver.Get() if ve != nil { if !ve.Loaded { - //log.Infof("Loading output vertex: %s", ve.ID) - //TODO: doing single vertex queries is slow. - // Need to rework this to do batched queries ve = graph.GetVertex(ve.ID, true) } + if ve == nil { + return nil + } return &gripql.QueryResult{ Result: &gripql.QueryResult_Vertex{ Vertex: ve.ToVertex(), @@ -176,6 +245,9 @@ func Convert(graph gdbi.GraphInterface, dataType gdbi.DataType, markTypes map[st if !ee.Loaded { ee = graph.GetEdge(ee.ID, true) } + if ee == nil { + return nil + } return &gripql.QueryResult{ Result: &gripql.QueryResult_Edge{ Edge: ee.ToEdge(), @@ -239,3 +311,175 @@ func Convert(graph gdbi.GraphInterface, dataType gdbi.DataType, markTypes map[st } return nil } + +func BatchConvert(ctx context.Context, graph gdbi.GraphInterface, dataType gdbi.DataType, markTypes map[string]gdbi.DataType, travelers []gdbi.Traveler) []*gripql.QueryResult { + if len(travelers) == 0 { + return nil + } + results := make([]*gripql.QueryResult, len(travelers)) + + if dataType == gdbi.VertexData { + type vertexResult struct { + idx int + ve *gdbi.DataElement + } + loadedVerts := make([]vertexResult, 0, len(travelers)) + reqChan := make(chan gdbi.ElementLookup, len(travelers)) + pending := 0 + for i, t := range travelers { + ver := t.GetCurrent() + if ver != nil { + ve := ver.Get() + if ve != nil { + if !ve.Loaded { + reqChan <- gdbi.ElementLookup{ID: ve.ID, Ref: t} + pending++ + } else { + loadedVerts = append(loadedVerts, vertexResult{idx: i, ve: ve}) + } + } + } + } + close(reqChan) + + if pending > 0 { + tToIdx := make(map[gdbi.Traveler]int) + for i, t := range travelers { + tToIdx[t] = i + } + + outChan := graph.GetVertexChannel(ctx, reqChan, true) + for lookup := range outChan { + idx := tToIdx[lookup.Ref] + if lookup.Vertex != nil { + loadedVerts = append(loadedVerts, vertexResult{idx: idx, ve: lookup.Vertex.Get()}) + } + } + } + + workers := runtime.GOMAXPROCS(0) + if workers < 1 { + workers = 1 + } + if workers > len(loadedVerts) { + workers = len(loadedVerts) + } + if workers <= 1 { + for _, item := range loadedVerts { + if item.ve == nil { + continue + } + results[item.idx] = &gripql.QueryResult{ + Result: &gripql.QueryResult_Vertex{ + Vertex: item.ve.ToVertex(), + }, + } + } + } else { + jobs := make(chan vertexResult, workers*2) + var wg sync.WaitGroup + for i := 0; i < workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for item := range jobs { + if item.ve == nil { + continue + } + results[item.idx] = &gripql.QueryResult{ + Result: &gripql.QueryResult_Vertex{ + Vertex: item.ve.ToVertex(), + }, + } + } + }() + } + for _, item := range loadedVerts { + jobs <- item + } + close(jobs) + wg.Wait() + } + } else if dataType == gdbi.EdgeData { + type edgeResult struct { + idx int + ee *gdbi.DataElement + } + loadedEdges := make([]edgeResult, 0, len(travelers)) + for i, t := range travelers { + eer := t.GetCurrent() + if eer == nil { + continue + } + ee := eer.Get() + if ee == nil { + continue + } + loadedEdges = append(loadedEdges, edgeResult{idx: i, ee: ee}) + } + + workers := runtime.GOMAXPROCS(0) + if workers < 1 { + workers = 1 + } + if workers > len(loadedEdges) { + workers = len(loadedEdges) + } + if workers <= 1 { + for _, item := range loadedEdges { + if item.ee == nil { + continue + } + if !item.ee.Loaded { + loaded := graph.GetEdge(item.ee.ID, true) + if loaded == nil { + continue + } + item.ee = loaded + } + results[item.idx] = &gripql.QueryResult{ + Result: &gripql.QueryResult_Edge{ + Edge: item.ee.ToEdge(), + }, + } + } + } else { + jobs := make(chan edgeResult, workers*2) + var wg sync.WaitGroup + for i := 0; i < workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for item := range jobs { + if item.ee == nil { + continue + } + ee := item.ee + if !ee.Loaded { + loaded := graph.GetEdge(ee.ID, true) + if loaded == nil { + continue + } + ee = loaded + } + results[item.idx] = &gripql.QueryResult{ + Result: &gripql.QueryResult_Edge{ + Edge: ee.ToEdge(), + }, + } + } + }() + } + for _, item := range loadedEdges { + jobs <- item + } + close(jobs) + wg.Wait() + } + } else { + for i, t := range travelers { + results[i] = Convert(graph, dataType, markTypes, t) + } + } + return results +} diff --git a/gdbi/interface.go b/gdbi/interface.go index 1d37686e..470ad517 100644 --- a/gdbi/interface.go +++ b/gdbi/interface.go @@ -85,6 +85,7 @@ type Signal struct { // Traveler is a query element that traverse the graph type BaseTraveler struct { Current *DataElement + currentRef DataRef Marks map[string]*DataElement Selections map[string]*DataElement Aggregation *Aggregate @@ -137,6 +138,7 @@ type ElementLookup struct { Ref Traveler Vertex VertexRef Edge EdgeRef + Priv any } // GraphDB is the base interface for graph databases diff --git a/gdbi/pipeline.go b/gdbi/pipeline.go index 32fe60a7..149b0035 100644 --- a/gdbi/pipeline.go +++ b/gdbi/pipeline.go @@ -10,6 +10,7 @@ type PipelineState interface { GetLastType() DataType SetLastType(DataType) StepLoadData() bool + StepRequiredFields() []string } type CustomProcGen interface { diff --git a/gdbi/state.go b/gdbi/state.go index c8824ff7..fc78cae7 100644 --- a/gdbi/state.go +++ b/gdbi/state.go @@ -11,6 +11,7 @@ type State struct { MarkTypes map[string]DataType Steps []string StepOutputs map[string][]string + StepFields map[string][]string CurStep string } @@ -19,6 +20,9 @@ func (ps *State) SetCurStatment(a int) { } func (ps *State) StepLoadData() bool { + if x, ok := ps.StepFields[ps.CurStep]; ok && len(x) > 0 { + return true + } if x, ok := ps.StepOutputs[ps.CurStep]; ok { if len(x) == 1 && x[0] == "_label" { return false @@ -28,6 +32,15 @@ func (ps *State) StepLoadData() bool { return false } +func (ps *State) StepRequiredFields() []string { + if x, ok := ps.StepFields[ps.CurStep]; ok { + out := make([]string, 0, len(x)) + out = append(out, x...) + return out + } + return nil +} + func (ps *State) GetLastType() DataType { return ps.LastType } @@ -39,11 +52,13 @@ func (ps *State) SetLastType(a DataType) { func NewPipelineState(stmts []*gripql.GraphStatement, storeMarks bool) *State { steps := inspect.PipelineSteps(stmts) stepOut := inspect.PipelineStepOutputs(stmts, storeMarks) + stepFields := inspect.PipelineStepRequiredFields(stmts) return &State{ LastType: NoData, MarkTypes: map[string]DataType{}, Steps: steps, StepOutputs: stepOut, + StepFields: stepFields, } } diff --git a/gdbi/traveler.go b/gdbi/traveler.go index b5162e13..2478e70c 100644 --- a/gdbi/traveler.go +++ b/gdbi/traveler.go @@ -2,9 +2,22 @@ package gdbi import ( "github.com/bmeg/grip/gdbi/tpath" - "github.com/bmeg/grip/util/copy" ) +type identityRef interface { + Identity() *DataElement +} + +func dataRefIdentity(r DataRef) *DataElement { + if r == nil { + return nil + } + if ir, ok := r.(identityRef); ok { + return ir.Identity() + } + return r.Get() +} + // These consts mark the type of a Pipeline traveler chan const ( // StateCustom The Pipeline will be emitting custom data structures @@ -23,52 +36,73 @@ const ( // AddCurrent creates a new copy of the travel with new 'current' value func (t *BaseTraveler) AddCurrent(r DataRef) Traveler { - o := BaseTraveler{ - Marks: map[string]*DataElement{}, - Path: make([]DataElementID, len(t.Path)+1), - Signal: t.Signal, - } - for k, v := range t.Marks { - o.Marks[k] = v - } - for i := range t.Path { - o.Path[i] = t.Path[i] - } + o := *t // Copy struct values (Marks, Path, etc. pointers are shared) if r != nil { - rd := r.Get() + o.currentRef = r.Copy() + o.Current = dataRefIdentity(o.currentRef) + if o.Current == nil { + o.Current = o.currentRef.Get() + } + prev := t.Current + if prev == nil { + prev = dataRefIdentity(t.currentRef) + } + + // Some transform processors emit a DataElement with only Data set. + // Treat that as the same current element identity. + if prev != nil && o.Current != nil && o.Current.ID == "" && o.Current.From == "" && o.Current.To == "" { + o.Current.ID = prev.ID + o.Current.From = prev.From + o.Current.To = prev.To + if o.Current.Label == "" { + o.Current.Label = prev.Label + } + } + + // Preserve existing path when current element identity does not change. + if prev != nil && o.Current != nil && + prev.ID == o.Current.ID && + prev.From == o.Current.From && + prev.To == o.Current.To { + if t.Path != nil { + o.Path = make([]DataElementID, len(t.Path)) + copy(o.Path, t.Path) + } + return &o + } + + // Bootstrap path tracking at the first traversal hop and append on each move. + pathLen := len(t.Path) + o.Path = make([]DataElementID, pathLen+1) + copy(o.Path, t.Path) + rd := o.Current if rd == nil { - o.Path[len(t.Path)] = DataElementID{} + o.Path[pathLen] = DataElementID{} } else if rd.To != "" { - o.Path[len(t.Path)] = DataElementID{Edge: rd.ID} + o.Path[pathLen] = DataElementID{Edge: rd.ID} } else { - o.Path[len(t.Path)] = DataElementID{Vertex: rd.ID} + o.Path[pathLen] = DataElementID{Vertex: rd.ID} } - o.Current = r.Get() } return &o } -// AddCurrent creates a new copy of the travel with new 'current' value +// Copy creates a new copy of the traveler func (t *BaseTraveler) Copy() Traveler { - o := BaseTraveler{ - Marks: map[string]*DataElement{}, - Path: make([]DataElementID, len(t.Path)), - Signal: t.Signal, + o := *t + if t.currentRef != nil { + o.currentRef = t.currentRef.Copy() } - for k, v := range t.Marks { - vg := v.Get() - o.Marks[k] = &DataElement{ - ID: vg.ID, - Label: vg.Label, - From: vg.From, To: vg.To, - Data: copy.DeepCopy(vg.Data).(map[string]interface{}), - Loaded: vg.Loaded, + if len(t.Marks) > 0 { + o.Marks = make(map[string]*DataElement, len(t.Marks)) + for k, v := range t.Marks { + o.Marks[k] = v // Shallow copy of DataElement is fine as they are usually immutable } } - for i := range t.Path { - o.Path[i] = t.Path[i] + if len(t.Path) > 0 { + o.Path = make([]DataElementID, len(t.Path)) + copy(o.Path, t.Path) } - o.Current = t.Current return &o } @@ -84,11 +118,14 @@ func (tr *BaseTraveler) IsSignal() bool { } func (tr *BaseTraveler) IsNull() bool { - return tr.Current == nil + return tr.Current == nil && tr.currentRef == nil } // HasMark checks to see if a results is stored in a travelers statemap func (t *BaseTraveler) HasMark(label string) bool { + if t.Marks == nil { + return false + } _, ok := t.Marks[label] return ok } @@ -104,38 +141,54 @@ func (t *BaseTraveler) ListMarks() []string { // AddMark adds a result to travels state map using `label` as the name func (t *BaseTraveler) AddMark(label string, r DataRef) Traveler { - o := BaseTraveler{Marks: map[string]*DataElement{}, Path: make([]DataElementID, len(t.Path))} + o := *t + o.Marks = make(map[string]*DataElement, len(t.Marks)+1) for k, v := range t.Marks { o.Marks[k] = v } o.Marks[label] = r.Get() - for i := range t.Path { - o.Path[i] = t.Path[i] - } - o.Current = t.Current return &o } func (t *BaseTraveler) UpdateMark(label string, r DataRef) { if label == tpath.CURRENT { - t.Current = r.Get() + t.currentRef = r.Copy() + t.Current = dataRefIdentity(t.currentRef) + if t.Current == nil { + t.Current = t.currentRef.Get() + } return } + if t.Marks == nil { + t.Marks = map[string]*DataElement{} + } t.Marks[label] = r.Get() } // GetMark gets stored result in travels state using its label func (t *BaseTraveler) GetMark(label string) DataRef { + if t.Marks == nil { + return nil + } return t.Marks[label] } // GetCurrent get current result value attached to the traveler func (t *BaseTraveler) GetCurrent() DataRef { + if t.currentRef != nil { + return t.currentRef + } return t.Current } func (t *BaseTraveler) GetCurrentID() string { - return t.Current.Get().ID + if t.Current == nil { + if cur := dataRefIdentity(t.currentRef); cur != nil { + return cur.ID + } + return "" + } + return t.Current.ID } func (t *BaseTraveler) GetCount() uint32 { diff --git a/gdbi/traveler_doc.go b/gdbi/traveler_doc.go index 4aab7c89..9d573b60 100644 --- a/gdbi/traveler_doc.go +++ b/gdbi/traveler_doc.go @@ -172,32 +172,28 @@ KeyLoop: } } - var out Traveler = &BaseTraveler{} + cde := t.GetCurrent().Get() + var out Traveler = t.Copy() out = out.AddCurrent(&DataElement{ - Data: map[string]interface{}{}, + ID: cde.ID, + Label: cde.Label, + From: cde.From, + To: cde.To, + Data: map[string]interface{}{}, }) - for _, mark := range t.ListMarks() { - out = out.AddMark(mark, t.GetMark(mark)) - } - - var cde *DataElement - var ode *DataElement - - cde = t.GetCurrent().Get() - ode = out.GetCurrent().Get() + ode := out.GetCurrent().Get() if len(excludePaths) > 0 { cde = excludeFields(cde, excludePaths) for k, v := range cde.Data { ode.Data[k] = v } + ode.ID = cde.ID + ode.Label = cde.Label + ode.From = cde.From + ode.To = cde.To } - ode.ID = cde.ID - ode.Label = cde.Label - ode.From = cde.From - ode.To = cde.To - if len(includePaths) > 0 { ode = includeFields(ode, cde, includePaths) } diff --git a/go.mod b/go.mod index 807fa9dd..b7f2e58f 100644 --- a/go.mod +++ b/go.mod @@ -8,7 +8,7 @@ require ( github.com/Workiva/go-datastructures v1.1.5 github.com/akrylysov/pogreb v0.10.2 github.com/antlr/antlr4/runtime/Go/antlr v1.4.10 - github.com/bmeg/benchtop v0.0.0-20251027212658-046a256eb6fa + github.com/bmeg/benchtop v0.0.0-20260306193933-7cdd75fcb2fb github.com/bmeg/jsonpath v0.0.0-20210207014051-cca5355553ad github.com/bmeg/jsonschema/v6 v6.0.4 github.com/bmeg/jsonschemagraph v0.0.4-0.20251017205345-236d2de9887c @@ -62,6 +62,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/internal v1.10.0 // indirect github.com/AzureAD/microsoft-authentication-library-for-go v1.3.2 // indirect github.com/DataDog/zstd v1.5.7 // indirect + github.com/apache/arrow/go/v18 v18.0.0-20241007013041-ab95a4d25142 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bytedance/sonic/loader v0.3.0 // indirect github.com/casbin/govaluate v1.2.0 // indirect @@ -91,6 +92,7 @@ require ( github.com/golang-jwt/jwt/v5 v5.2.1 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/golang/snappy v0.0.5-0.20231225225746-43d5d4cd4e0e // indirect + github.com/google/flatbuffers v24.3.25+incompatible // indirect github.com/google/pprof v0.0.0-20240711041743-f6c9dda6c6da // indirect github.com/google/uuid v1.6.0 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect @@ -129,18 +131,22 @@ require ( github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 // indirect github.com/rogpeppe/go-internal v1.12.0 // indirect github.com/rs/xid v1.5.0 // indirect - github.com/spf13/pflag v1.0.5 // indirect + github.com/spf13/pflag v1.0.6 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/xdg-go/pbkdf2 v1.0.0 // indirect github.com/xdg-go/scram v1.1.2 // indirect github.com/xdg-go/stringprep v1.0.4 // indirect github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + github.com/zeebo/xxh3 v1.0.2 // indirect + go.etcd.io/bbolt v1.4.3 // indirect golang.org/x/arch v0.0.0-20210923205945-b76863e36670 // indirect golang.org/x/crypto v0.36.0 // indirect golang.org/x/exp v0.0.0-20240707233637-46b078467d37 // indirect + golang.org/x/mod v0.20.0 // indirect golang.org/x/sys v0.33.0 // indirect golang.org/x/text v0.23.0 // indirect - gonum.org/v1/gonum v0.8.2 // indirect + golang.org/x/tools v0.24.0 // indirect + golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250811230008-5f3141c8851a // indirect gopkg.in/sourcemap.v1 v1.0.5 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect diff --git a/go.sum b/go.sum index 372d7c13..f5634b24 100644 --- a/go.sum +++ b/go.sum @@ -26,17 +26,22 @@ github.com/Shopify/toxiproxy/v2 v2.5.0 h1:i4LPT+qrSlKNtQf5QliVjdP08GyAH8+BUIc9gT github.com/Shopify/toxiproxy/v2 v2.5.0/go.mod h1:yhM2epWtAmel9CB8r2+L+PCmhH6yH2pITaPAo7jxJl0= github.com/Workiva/go-datastructures v1.1.5 h1:5YfhQ4ry7bZc2Mc7R0YZyYwpf5c6t1cEFvdAhd6Mkf4= github.com/Workiva/go-datastructures v1.1.5/go.mod h1:1yZL+zfsztete+ePzZz/Zb1/t5BnDuE2Ya2MMGhzP6A= -github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= github.com/akrylysov/pogreb v0.10.2 h1:e6PxmeyEhWyi2AKOBIJzAEi4HkiC+lKyCocRGlnDi78= github.com/akrylysov/pogreb v0.10.2/go.mod h1:pNs6QmpQ1UlTJKDezuRWmaqkgUE2TuU0YTWyqJZ7+lI= +github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= +github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= github.com/antlr/antlr4/runtime/Go/antlr v1.4.10 h1:yL7+Jz0jTC6yykIK/Wh74gnTJnrGr5AyrNMXuA0gves= github.com/antlr/antlr4/runtime/Go/antlr v1.4.10/go.mod h1:F7bn7fEU90QkQ3tnmaTx3LTKLEDqnwWODIYppRQ5hnY= +github.com/apache/arrow/go/v18 v18.0.0-20241007013041-ab95a4d25142 h1:6EtsUpu9/vLtVl6oVpFiZe9GRax7STd2bG55VNwsRdI= +github.com/apache/arrow/go/v18 v18.0.0-20241007013041-ab95a4d25142/go.mod h1:GjCnS5QddrJzyqrdYqCUvwlND7SfAw4WH/722M2U2NM= +github.com/apache/thrift v0.20.0 h1:631+KvYbsBZxmuJjYwhezVsrfc/TbqtZV4QcxOX1fOI= +github.com/apache/thrift v0.20.0/go.mod h1:hOk1BQqcp2OLzGsyVXdfMk7YFlMxK3aoEVhjD06QhB8= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/bmeg/benchtop v0.0.0-20251027212658-046a256eb6fa h1:8gqN6aRKHYkAQGXr8bdOquCl6gzn42jl31aUtznYJlY= -github.com/bmeg/benchtop v0.0.0-20251027212658-046a256eb6fa/go.mod h1:mKIXKgNg/q55XrsWKAeWBI9aeSV9yep6tdqaZYHkDcw= +github.com/bmeg/benchtop v0.0.0-20260306193933-7cdd75fcb2fb h1:WlotSQOFfCR3oPkKLbl4qljFZDqH46shJRxT6anDJGg= +github.com/bmeg/benchtop v0.0.0-20260306193933-7cdd75fcb2fb/go.mod h1:oTpEABxCCloUfzCrhLWQ56J1xliB2kmQ8Xxy82g3DKY= github.com/bmeg/jsonpath v0.0.0-20210207014051-cca5355553ad h1:ICgBexeLB7iv/IQz4rsP+MimOXFZUwWSPojEypuOaQ8= github.com/bmeg/jsonpath v0.0.0-20210207014051-cca5355553ad/go.mod h1:ft96Irkp72C7ZrUWRenG7LrF0NKMxXdRvsypo5Njhm4= github.com/bmeg/jsonschema/v6 v6.0.4 h1:AXFAz7G05VZkKretSSU+uacMKF8+C16ONG6pzFzzA7E= @@ -124,7 +129,6 @@ github.com/fatih/color v1.17.0 h1:GlRw1BRJxkpqUCBKzKOw098ed57fEsKeNjpTe3cSjK4= github.com/fatih/color v1.17.0/go.mod h1:YZ7TlrGPkiz6ku9fK3TLD/pl3CpsiFyu8N92HLgmosI= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= -github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= @@ -156,7 +160,6 @@ github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang-jwt/jwt/v5 v5.2.1 h1:OuVbFODueb089Lh128TAcimifWaLhJwVflnrgM17wHk= github.com/golang-jwt/jwt/v5 v5.2.1/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk= -github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.4.4 h1:l75CXGRSwbaYNpl/Z2X1XIIAMSCquvXgpVZDhwEIJsc= @@ -177,6 +180,8 @@ github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8l github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v0.0.5-0.20231225225746-43d5d4cd4e0e h1:4bw4WeyTYPp0smaXiJZCNnLrvVBqirQVreixayXezGc= github.com/golang/snappy v0.0.5-0.20231225225746-43d5d4cd4e0e/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/flatbuffers v24.3.25+incompatible h1:CX395cjN9Kke9mmalRoL3d81AtFUxJM+yDthflgJGkI= +github.com/google/flatbuffers v24.3.25+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= @@ -239,13 +244,14 @@ github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= -github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= github.com/keybase/go-keychain v0.0.0-20231219164618-57a3676c3af6 h1:IsMZxCuZqKuao2vNdfD82fjjgPLfyHLpR41Z88viRWs= github.com/keybase/go-keychain v0.0.0-20231219164618-57a3676c3af6/go.mod h1:3VeWNIJaW+O5xpRQbPp0Ybqu1vJd/pm7s2F473HRrkw= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= +github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= github.com/klauspost/compress v1.12.3/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= @@ -285,6 +291,10 @@ github.com/mattn/go-sqlite3 v1.14.23 h1:gbShiuAP1W5j9UOksQ06aiiqPMxYecovVGwmTxWt github.com/mattn/go-sqlite3 v1.14.23/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/maypok86/otter/v2 v2.1.0 h1:H+FO9NtLuSWYUlIUQ/kT6VNEpWSIF4w4GZJRDhxYb7k= github.com/maypok86/otter/v2 v2.1.0/go.mod h1:jX2xEKz9PrNVbDqnk8JUuOt5kURK8h7jd1kDYI5QsZk= +github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= +github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= +github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= +github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM= github.com/minio/minio-go/v7 v7.0.73 h1:qr2vi96Qm7kZ4v7LLebjte+MQh621fFWnv93p12htEo= @@ -377,8 +387,9 @@ github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -413,6 +424,12 @@ github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfS github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= +github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= +github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= +github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= +go.etcd.io/bbolt v1.4.3 h1:dEadXpI6G79deX5prL3QRNP6JB8UxVkqo4UPnHaNXJo= +go.etcd.io/bbolt v1.4.3/go.mod h1:tKQlpPaYCVFctUIgFKFnAlvbmB3tpy1vkTnDWohtc0E= go.mongodb.org/mongo-driver v1.17.0 h1:Hp4q2MCjvY19ViwimTs00wHi7G4yzxh4/2+nTx8r40k= go.mongodb.org/mongo-driver v1.17.0/go.mod h1:wwWm/+BuOddhcq3n68LKRmgk2wXzmF6s0SFOa0GINL4= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= @@ -442,12 +459,9 @@ golang.org/x/crypto v0.6.0/go.mod h1:OFC/31mSvZgRz0V1QTNCzfAI1aIRzbiufJtkMIlEp58 golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34= golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20240707233637-46b078467d37 h1:uLDX+AfeFCct3a2C7uIWBKMJIR3CJMhcgfrUAqjRK6w= golang.org/x/exp v0.0.0-20240707233637-46b078467d37/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= -golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= @@ -455,6 +469,8 @@ golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHl golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -528,7 +544,6 @@ golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= @@ -539,17 +554,18 @@ golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roY golang.org/x/tools v0.0.0-20201022035929-9cf592e881e9/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= +golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSmiC7MMxXNOb3PU/VUEz+EhU= +golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= -gonum.org/v1/gonum v0.8.2 h1:CCXrcPKiGGotvnN6jfUsKk4rRqm7q09/YbKb5xCEvtM= -gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= +gonum.org/v1/gonum v0.15.0 h1:2lYxjRbTYyxkJxlhC+LvJIx3SsANPdRybu1tGj9/OrQ= +gonum.org/v1/gonum v0.15.0/go.mod h1:xzZVBJBtS+Mz4q0Yl2LJTk+OxOg4jiXZ7qBoM0uISGo= gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= -gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= -gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= @@ -598,6 +614,5 @@ gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50= -rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/grids/bulk_add_skip_existing_test.go b/grids/bulk_add_skip_existing_test.go new file mode 100644 index 00000000..73fdbda2 --- /dev/null +++ b/grids/bulk_add_skip_existing_test.go @@ -0,0 +1,142 @@ +package grids + +import ( + "testing" + + "github.com/bmeg/grip/gdbi" +) + +func runBulkAdd(t *testing.T, g *Graph, elems ...*gdbi.GraphElement) { + t.Helper() + ch := make(chan *gdbi.GraphElement, len(elems)) + for _, e := range elems { + ch <- e + } + close(ch) + if err := g.BulkAdd(ch); err != nil { + t.Fatalf("BulkAdd failed: %v", err) + } +} + +func TestBulkAddSkipsExistingVertexIDs(t *testing.T) { + conf := Config{ + GraphDir: t.TempDir(), + Driver: "jsontable", + } + dbi, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("NewGraphDB failed: %v", err) + } + defer dbi.Close() + + if err := dbi.AddGraph("g"); err != nil { + t.Fatalf("AddGraph failed: %v", err) + } + gi, err := dbi.Graph("g") + if err != nil { + t.Fatalf("Graph failed: %v", err) + } + g := gi.(*Graph) + + // First insert. + runBulkAdd(t, g, &gdbi.GraphElement{ + Vertex: &gdbi.Vertex{ + ID: "v1", + Label: "person", + Data: map[string]any{"name": "first"}, + }, + }) + + // Second insert with same ID should be skipped (insert-only semantics). + runBulkAdd(t, g, &gdbi.GraphElement{ + Vertex: &gdbi.Vertex{ + ID: "v1", + Label: "person", + Data: map[string]any{"name": "second"}, + }, + }) + + v := g.GetVertex("v1", true) + if v == nil { + t.Fatalf("expected vertex v1 to exist") + } + gotName, _ := v.Data["name"].(string) + if gotName != "first" { + t.Fatalf("expected duplicate insert to be skipped; name=%q want=%q", gotName, "first") + } + + table, err := g.driver.GetOrLoadTable("v_person") + if err != nil { + t.Fatalf("getOrLoadTable(v_person) failed: %v", err) + } + count := 0 + for range table.ScanId(nil) { + count++ + } + if count != 1 { + t.Fatalf("expected exactly one physical row for duplicate ID; got=%d want=1", count) + } +} + +func TestBulkAddSkipsIntraStreamDuplicates(t *testing.T) { + conf := Config{ + GraphDir: t.TempDir(), + Driver: "jsontable", + } + dbi, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("NewGraphDB failed: %v", err) + } + defer dbi.Close() + + if err := dbi.AddGraph("g"); err != nil { + t.Fatalf("AddGraph failed: %v", err) + } + gi, err := dbi.Graph("g") + if err != nil { + t.Fatalf("Graph failed: %v", err) + } + g := gi.(*Graph) + + // Feed duplicates into the SAME stream + runBulkAdd(t, g, + &gdbi.GraphElement{ + Vertex: &gdbi.Vertex{ + ID: "v1", + Label: "person", + Data: map[string]any{"name": "first"}, + }, + }, + &gdbi.GraphElement{ + Vertex: &gdbi.Vertex{ + ID: "v1", + Label: "person", + Data: map[string]any{"name": "second"}, + }, + }, + ) + + v := g.GetVertex("v1", true) + if v == nil { + t.Fatalf("expected vertex v1 to exist") + } + // The first one should win if batch deduplication works, or second if not? + // Actually, since they are in the same batch, the map logic `if _, ok := batchSeen[id]; ok` + // means the FIRST one encountered in the batch wins and subsequent ones are skipped. + gotName, _ := v.Data["name"].(string) + if gotName != "first" { + t.Fatalf("expected first insert in batch to win; name=%q want=%q", gotName, "first") + } + + table, err := g.driver.GetOrLoadTable("v_person") + if err != nil { + t.Fatalf("getOrLoadTable(v_person) failed: %v", err) + } + count := 0 + for range table.ScanId(nil) { + count++ + } + if count != 1 { + t.Fatalf("expected exactly one physical row for duplicate ID in same stream; got=%d want=1", count) + } +} diff --git a/grids/config.go b/grids/config.go new file mode 100644 index 00000000..b520880b --- /dev/null +++ b/grids/config.go @@ -0,0 +1,18 @@ +package grids + +import "runtime" + +type Config struct { + GraphDir string + BulkLoaderWorkers int + Driver string +} + +func (c *Config) SetDefaults() { + if c.BulkLoaderWorkers == 0 { + c.BulkLoaderWorkers = runtime.NumCPU() + } + if c.Driver == "" { + c.Driver = "jsontable" + } +} diff --git a/grids/driver/driver.go b/grids/driver/driver.go new file mode 100644 index 00000000..fac47662 --- /dev/null +++ b/grids/driver/driver.go @@ -0,0 +1,1167 @@ +package driver + +import ( + "bytes" + "context" + "encoding/binary" + "errors" + "fmt" + "sort" + "strings" + "sync" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/arrowdriver" + bFilters "github.com/bmeg/benchtop/filters" + "github.com/bmeg/benchtop/jsontable" + "github.com/bmeg/benchtop/jsontable/tpath" + "github.com/bmeg/benchtop/pebblebulk" + "github.com/bmeg/benchtop/query" + "github.com/bmeg/benchtop/util" + "github.com/bmeg/grip/grids/driver/indexer" + "github.com/bmeg/grip/grids/key" + "github.com/bmeg/grip/log" + "github.com/cockroachdb/pebble" +) + +var ErrNotFound = errors.New("row not found in any table") + +type IDInfo struct { + Label string + Loc *benchtop.RowLoc + Data map[string]any +} + +type BackendTable struct { + Name string + Label string + TableId uint16 + Fields map[string]struct{} + benchtop.TableStore +} + +type FieldInfo struct { + Label string + Field string +} + +type GridKVDriver struct { + Lock sync.RWMutex + PebbleLock sync.RWMutex + Pkv *pebblebulk.PebbleKV + closePkv func() error + Tables map[string]*BackendTable + TablesByID map[uint16]*BackendTable + TableDr benchtop.TableDriver + + // ID mapping state (volatile or using pebble) + idMapMu sync.Mutex + idMap map[string]uint64 + ridMap map[uint64]string + nextID uint64 +} + +func tableLabelFromName(name string) string { + if strings.HasPrefix(name, "v_") || strings.HasPrefix(name, "e_") { + return name[2:] + } + return name +} + +func columnDefsToFieldSet(cols []benchtop.ColumnDef) map[string]struct{} { + fields := make(map[string]struct{}, len(cols)) + for _, col := range cols { + fields[col.Key] = struct{}{} + } + return fields +} + +func newBackendTable(name string, tableID uint16, store benchtop.TableStore) *BackendTable { + return &BackendTable{ + Name: name, + Label: tableLabelFromName(name), + TableId: tableID, + Fields: columnDefsToFieldSet(store.GetColumnDefs()), + TableStore: store, + } +} + +func NewGridKVDriver(path string, driver string) (*GridKVDriver, error) { + if driver == "" { + driver = "jsontable" + } + driver = strings.ToLower(driver) + log.Infof("Initializing grids benchtop backend driver=%s path=%s", driver, path) + + var td benchtop.TableDriver + var pkv *pebblebulk.PebbleKV + var closePkv func() error + var err error + switch driver { + case "jsontable", "json": + td, err = jsontable.NewJSONDriver(path) + if err != nil { + return nil, err + } + rawKV := td.GetKV() + typedKV, ok := rawKV.(*pebblebulk.PebbleKV) + if !ok || typedKV == nil { + td.Close() + return nil, fmt.Errorf("jsontable driver returned unsupported KV type %T", rawKV) + } + pkv = typedKV + case "arrow": + pkv, err = pebblebulk.NewPebbleKV(path) + if err != nil { + return nil, err + } + closePkv = pkv.Close + td, err = arrowdriver.NewArrowDriver(path) + if err != nil { + pkv.Close() + return nil, err + } + default: + return nil, fmt.Errorf("unsupported grids table driver %q; supported drivers: jsontable, arrow", driver) + } + + dr := &GridKVDriver{ + Lock: sync.RWMutex{}, + PebbleLock: sync.RWMutex{}, + TableDr: td, + Pkv: pkv, + closePkv: closePkv, + Tables: make(map[string]*BackendTable), + TablesByID: make(map[uint16]*BackendTable), + idMap: make(map[string]uint64), + ridMap: make(map[uint64]string), + } + + // Load existing ID mapping stats + val, closer, err := dr.Pkv.Get(benchtop.MaxIDKey) + if err == nil { + dr.nextID = binary.BigEndian.Uint64(val) + closer.Close() + } else { + dr.nextID = 1 + } + + // We no longer PreloadCache as locations are embedded in structural keys. + // But we MUST discover which tables exist so label scans work. + for _, tableName := range dr.TableDr.List() { + if _, err := dr.GetOrLoadTable(tableName); err != nil { + log.Errorf("Failed to discover table %s: %v", tableName, err) + } + } + if err := dr.LoadFields(); err != nil { + dr.Close() + return nil, err + } + + return dr, nil +} + +func (dr *GridKVDriver) GetID(s string) (uint64, error) { + ids, err := dr.GetIDs([]string{s}) + if err != nil { + return 0, err + } + return ids[0], nil +} + +func (dr *GridKVDriver) GetIDs(ids []string) ([]uint64, error) { + out := make([]uint64, len(ids)) + remaining := make(map[int]string) + + dr.idMapMu.Lock() + for i, s := range ids { + if id, ok := dr.idMap[s]; ok { + out[i] = id + } else { + remaining[i] = s + } + } + dr.idMapMu.Unlock() + + if len(remaining) == 0 { + return out, nil + } + + // Fetch missing from Pebble + err := dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for i, s := range remaining { + ikey := key.StringToIDKey(s) + val, err := it.Get(ikey) + if err == nil { + id := binary.BigEndian.Uint64(val) + out[i] = id + delete(remaining, i) + } + } + return nil + }) + if err != nil { + return nil, err + } + + if len(remaining) == 0 { + // Update cache + dr.idMapMu.Lock() + for i, id := range out { + dr.idMap[ids[i]] = id + dr.ridMap[id] = ids[i] + } + dr.idMapMu.Unlock() + return out, nil + } + + // Create new IDs for orphans + err = dr.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + dr.idMapMu.Lock() + defer dr.idMapMu.Unlock() + + for i, s := range remaining { + // Double check if someone else created it + ikey := key.StringToIDKey(s) + id := dr.nextID + dr.nextID++ + + idBytes := make([]byte, 8) + binary.BigEndian.PutUint64(idBytes, id) + if err := tx.Set(ikey, idBytes, nil); err != nil { + return err + } + if err := tx.Set(key.IDToStringKey(id), []byte(s), nil); err != nil { + return err + } + out[i] = id + dr.idMap[s] = id + dr.ridMap[id] = s + } + + maxBytes := make([]byte, 8) + binary.BigEndian.PutUint64(maxBytes, dr.nextID) + if err := tx.Set(benchtop.MaxIDKey, maxBytes, nil); err != nil { + return err + } + return nil + }) + + if err != nil { + return nil, err + } + + return out, nil +} + +func (dr *GridKVDriver) TranslateID(id uint64) (string, error) { + dr.idMapMu.Lock() + if s, ok := dr.ridMap[id]; ok { + dr.idMapMu.Unlock() + return s, nil + } + dr.idMapMu.Unlock() + + rkey := key.IDToStringKey(id) + val, closer, err := dr.Pkv.Get(rkey) + if err != nil { + return "", err + } + defer closer.Close() + s := string(val) + + dr.idMapMu.Lock() + dr.idMap[s] = id + dr.ridMap[id] = s + dr.idMapMu.Unlock() + return s, nil +} + +func (d *GridKVDriver) AddFieldIndex(label, field string) error { + id, err := d.TableDr.LookupTableID(label) + if err != nil { + return err + } + return d.AddField(id, field) +} + +func (d *GridKVDriver) RemoveFieldIndex(label, field string) error { + id, err := d.TableDr.LookupTableID(label) + if err != nil { + return err + } + return d.RemoveField(id, field) +} + +func (d *GridKVDriver) Close() { + d.Lock.Lock() + defer d.Lock.Unlock() + if d.TableDr != nil { + d.TableDr.Close() + } + if d.closePkv != nil { + _ = d.closePkv() + d.closePkv = nil + } +} + +func (d *GridKVDriver) GetOrLoadTable(name string) (*BackendTable, error) { + // Resolve ID from label name + id, err := d.TableDr.LookupTableID(name) + if err != nil { + // Try case-insensitive lookup if direct lookup fails (optional, based on design) + // But TableDr should handle canonicalization or we accept error. + return nil, fmt.Errorf("table %s not found: %v", name, err) + } + + d.Lock.RLock() + if t, ok := d.Tables[name]; ok { + d.Lock.RUnlock() + return t, nil + } + d.Lock.RUnlock() + + d.Lock.Lock() + defer d.Lock.Unlock() + if t, ok := d.Tables[name]; ok { + return t, nil + } + + store, err := d.TableDr.Get(id) + if err != nil { + return nil, err + } + bt := newBackendTable(name, id, store) + d.Tables[name] = bt + d.TablesByID[id] = bt + return bt, nil +} + +func (d *GridKVDriver) GetTableByID(id uint16) (*BackendTable, error) { + d.Lock.RLock() + t, ok := d.TablesByID[id] + d.Lock.RUnlock() + if ok { + return t, nil + } + + // Try to load if not in memory + info, err := d.TableDr.GetTableInfo(id) + if err != nil { + return nil, err + } + return d.GetOrLoadTable(info.Name) +} + +func (d *GridKVDriver) New(name string, columns []benchtop.ColumnDef) (benchtop.TableStore, error) { + // Check if already exists? GetOrLoad checks. + // We can trust TableDr.New to handle existence or overwriting logic. + + store, err := d.TableDr.New(name, columns) + if err != nil { + return nil, err + } + + // We need ID to store in cache. + id, err := d.TableDr.LookupTableID(name) + if err != nil { + // Should not happen if New succeeded? + return nil, fmt.Errorf("failed to lookup ID after New(%s): %v", name, err) + } + + d.Lock.Lock() + defer d.Lock.Unlock() + + // Check if already in map (race condition?) + if t, ok := d.Tables[name]; ok { + return t, nil + } + + t := newBackendTable(name, id, store) + d.Tables[name] = t + d.TablesByID[id] = t + return t, nil +} + +func (d *GridKVDriver) Get(name string) (benchtop.TableStore, error) { + return d.GetOrLoadTable(name) +} + +func (d *GridKVDriver) List() []string { return d.TableDr.List() } + +func (d *GridKVDriver) AddTableEntryInfo(tx *pebblebulk.PebbleBulk, rowID []byte, rowLoc *benchtop.RowLoc) error { + return tx.Set(benchtop.NewPosKey(rowLoc.TableId, rowID), benchtop.EncodeRowLoc(rowLoc), nil) +} + +func (d *GridKVDriver) BulkLoad(tableID uint16, rows chan *benchtop.Row) error { + return d.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + return d.BulkLoadInternal(tableID, rows, tx) + }) +} + +func (d *GridKVDriver) BulkLoadInternal(targetID uint16, inputs chan *benchtop.Row, tx *pebblebulk.PebbleBulk) error { + var wg sync.WaitGroup + tableChans := make(map[uint16]chan *benchtop.Row) + + // Global tracker for this specific bulk load session + // This prevents duplicates from entering ANY table channel + inFlight := sync.Map{} + + // We also need a snapshot here to check against the existing DB + snap := d.Pkv.Db.NewSnapshot() + defer snap.Close() + + wg.Add(1) + go func() { + defer wg.Done() + for row := range inputs { + if row == nil { + continue + } + + // 1. Check against the DB (Snapshot) + pKey := benchtop.NewPosKey(row.TableID, row.Id) + _, closer, err := snap.Get(pKey) + if err == nil { + closer.Close() + continue // Already in DB + } + + // 2. Check against "In-Flight" memory + idStr := string(row.Id) + if _, loaded := inFlight.LoadOrStore(idStr, struct{}{}); loaded { + continue // Already being processed by a channel + } + + // 3. Dispatch to table channel + id := row.TableID + ch, exists := tableChans[id] + if !exists { + // ... (your existing table info loading logic) + ch = make(chan *benchtop.Row, 1024) + tableChans[id] = ch + wg.Add(1) + go func(id uint16, c chan *benchtop.Row) { + defer wg.Done() + d.TableDr.BulkLoad(id, c) + }(id, ch) + } + ch <- row + } + for _, ch := range tableChans { + close(ch) + } + }() + wg.Wait() + return nil +} + +func (d *GridKVDriver) RowIdsByHas(field string, value any, op query.Condition) chan benchtop.Index { + return d.TableDr.RowIdsByHas(field, value, op) +} + +func (d *GridKVDriver) tableIDsForLabel(label string) []uint16 { + d.Lock.RLock() + var tids []uint16 + for _, t := range d.Tables { + if t.Label == label || t.Name == label { + tids = append(tids, t.TableId) + } + } + d.Lock.RUnlock() + return tids +} + +func (d *GridKVDriver) rowIdsByTableSetFieldValue(tableIDs map[uint16]struct{}, field string, value any, op query.Condition) chan benchtop.Index { + out := make(chan benchtop.Index) + go func() { + defer close(out) + if len(tableIDs) == 0 { + return + } + + emitFallback := func(fallbackIDs []uint16) { + var wg sync.WaitGroup + for _, tableID := range fallbackIDs { + wg.Add(1) + go func(tableID uint16) { + defer wg.Done() + for idx := range d.rowIdsByTableFieldValueLive(tableID, field, value, op) { + out <- idx + } + }(tableID) + } + wg.Wait() + } + + if op == query.EQ { + // Use index only for tables known to have this field indexed. + indexedTables := make(map[uint16]struct{}, len(tableIDs)) + fallbackIDs := make([]uint16, 0, len(tableIDs)) + d.Lock.RLock() + for tableID := range tableIDs { + if tbl, ok := d.TablesByID[tableID]; ok && tbl != nil && tbl.Fields != nil { + if _, indexed := tbl.Fields[field]; indexed { + indexedTables[tableID] = struct{}{} + continue + } + } + fallbackIDs = append(fallbackIDs, tableID) + } + d.Lock.RUnlock() + + usedIndex := false + if len(indexedTables) > 0 { + prefix := benchtop.FieldValueKey(field, value) + if prefix != nil { + usedIndex = true + prefix = append(prefix, benchtop.FieldSep...) + seen := make(map[string]struct{}, 4096) + _ = d.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { + _, tableID, _, rowID := benchtop.FieldKeyParse(it.Key()) + if _, ok := indexedTables[tableID]; !ok { + continue + } + + // Deduplicate by (tableID,rowID) in case of stale duplicate index entries. + seenKeyBuf := make([]byte, 2+len(rowID)) + binary.LittleEndian.PutUint16(seenKeyBuf[0:2], tableID) + copy(seenKeyBuf[2:], rowID) + seenKey := string(seenKeyBuf) + if _, ok := seen[seenKey]; ok { + continue + } + + val, err := it.Value() + if err != nil { + continue + } + idxLoc := benchtop.DecodeRowLoc(val) + if idxLoc == nil { + continue + } + + // Validate against live primary index to avoid tombstoned/stale field index keys. + posVal, err := it.Get(benchtop.NewPosKey(tableID, rowID)) + if err != nil { + continue + } + posLoc := benchtop.DecodeRowLoc(posVal) + if posLoc == nil || !sameRowLoc(posLoc, idxLoc) { + continue + } + safeID := make([]byte, len(rowID)) + copy(safeID, rowID) + seen[seenKey] = struct{}{} + out <- benchtop.Index{Key: safeID, Loc: posLoc} + } + return nil + }) + } + } + + if !usedIndex { + for tableID := range indexedTables { + fallbackIDs = append(fallbackIDs, tableID) + } + } + emitFallback(fallbackIDs) + return + } + + // Non-EQ operators: defer to per-table evaluator. + fallbackIDs := make([]uint16, 0, len(tableIDs)) + for tableID := range tableIDs { + fallbackIDs = append(fallbackIDs, tableID) + } + emitFallback(fallbackIDs) + }() + return out +} + +func sameRowLoc(a, b *benchtop.RowLoc) bool { + if a == nil || b == nil { + return false + } + return a.TableId == b.TableId && + a.Section == b.Section && + a.Offset == b.Offset && + a.Size == b.Size && + a.Index == b.Index +} + +func (d *GridKVDriver) rowIdsByTableFieldValueLive(tableID uint16, field string, value any, op query.Condition) chan benchtop.Index { + out := make(chan benchtop.Index, 256) + go func() { + defer close(out) + + t, err := d.GetTableByID(tableID) + if err != nil { + return + } + cond := &bFilters.FieldFilter{ + Operator: op, + Field: field, + Value: value, + } + + const batchSize = 2048 + locs := make([]*benchtop.RowLoc, 0, batchSize) + ids := make([][]byte, 0, batchSize) + + flush := func() { + if len(locs) == 0 { + return + } + rows, errs := t.GetRows(locs) + for i := range rows { + if i >= len(errs) || errs[i] != nil || rows[i] == nil { + continue + } + fieldVal := tpath.PathLookup(rows[i], field) + if !bFilters.ApplyFilterCondition(fieldVal, cond) { + continue + } + safeID := make([]byte, len(ids[i])) + copy(safeID, ids[i]) + out <- benchtop.Index{Key: safeID, Loc: locs[i]} + } + locs = locs[:0] + ids = ids[:0] + } + + for idx := range d.GetIndicesForTable(tableID) { + if idx.Loc == nil || len(idx.Key) == 0 { + continue + } + locs = append(locs, idx.Loc) + ids = append(ids, idx.Key) + if len(locs) >= batchSize { + flush() + } + } + flush() + }() + return out +} + +func (d *GridKVDriver) RowIdsByLabelFieldValue(label, field string, value any, op query.Condition) chan benchtop.Index { + tids := d.tableIDsForLabel(label) + tableSet := make(map[uint16]struct{}, len(tids)) + for _, tableID := range tids { + tableSet[tableID] = struct{}{} + } + return d.rowIdsByTableSetFieldValue(tableSet, field, value, op) +} + +func (d *GridKVDriver) RowIdsByLabelsFieldValue(labels []string, field string, value any, op query.Condition) chan benchtop.Index { + tableSet := make(map[uint16]struct{}, len(labels)) + for _, label := range labels { + for _, tableID := range d.tableIDsForLabel(label) { + tableSet[tableID] = struct{}{} + } + } + return d.rowIdsByTableSetFieldValue(tableSet, field, value, op) +} + +func (d *GridKVDriver) GetLabels(edges bool, removePrefix bool) chan string { + return d.TableDr.GetLabels(edges, removePrefix) +} + +func (d *GridKVDriver) InvalidateLoc(tableID uint16, rowID string) { + d.TableDr.InvalidateLoc(tableID, rowID) +} + +func (d *GridKVDriver) GetIDsForLabel(label string) chan string { + tids := d.tableIDsForLabel(label) + + out := make(chan string) + go func() { + defer close(out) + var wg sync.WaitGroup + for _, tid := range tids { + wg.Add(1) + go func(tid uint16) { + defer wg.Done() + for id := range d.GetIDsForTable(tid) { + out <- id + } + }(tid) + } + wg.Wait() + }() + return out +} + +func (d *GridKVDriver) AddField(tableID uint16, field string) error { + d.Lock.Lock() + if t, ok := d.TablesByID[tableID]; ok { + if t.Fields == nil { + t.Fields = make(map[string]struct{}) + } + t.Fields[field] = struct{}{} + } + d.Lock.Unlock() + + forwardKey, reversePrefix := indexer.PresenceKeys(tableID, field) + if err := d.Pkv.Set(forwardKey, []byte{}, nil); err != nil { + return err + } + if err := d.Pkv.Set(reversePrefix, []byte{}, nil); err != nil { + return err + } + + store, err := d.TableDr.Get(tableID) + if err != nil { + return err + } + return d.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + for r := range store.ScanFull(nil) { + fieldValue := tpath.PathLookup(r.DataMap, field) + rowID, ok := r.DataMap["_id"].(string) + if !ok { + continue + } + if fieldValue != nil { + for _, e := range indexer.BuildFieldIndexEntries(tableID, field, rowID, fieldValue, r.Loc) { + if err := tx.Set(e.Key, e.Value, nil); err != nil { + return err + } + } + } + } + return nil + }) +} + +func (d *GridKVDriver) RemoveField(tableID uint16, field string) error { + d.Lock.Lock() + if t, ok := d.TablesByID[tableID]; ok && t.Fields != nil { + delete(t.Fields, field) + } + d.Lock.Unlock() + + fieldPrefix := benchtop.FieldLabelKey(field, tableID) + _, revPrefix := indexer.PresenceKeys(tableID, field) + return d.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + if err := tx.DeletePrefix(fieldPrefix); err != nil { + return err + } + if err := tx.DeletePrefix(revPrefix); err != nil { + return err + } + return nil + }) +} + +func (d *GridKVDriver) DeleteRowField(tableID uint16, field, rowID string) error { + // Deletes a singular row index field + + // Get the field value from the reverse index + rowIndexKey := benchtop.RFieldKey(tableID, field, rowID) + var fieldValueBytes []byte + err := d.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + val, err := it.Get(rowIndexKey) + if err != nil { + return err + } + fieldValueBytes = make([]byte, len(val)) + copy(fieldValueBytes, val) + return nil + }) + + if err != nil { + if errors.Is(err, pebble.ErrNotFound) { + return nil // Already deleted or doesn't exist + } + return err + } + + var fieldValue any + if len(fieldValueBytes) > 0 { + decoded, err := indexer.DecodeReverseFieldValue(fieldValueBytes) + if err != nil { + return err + } + fieldValue = decoded + } + + // Delete both the forward and reverse index entries + return d.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + if err := tx.Delete(benchtop.FieldKey(field, tableID, fieldValue, []byte(rowID)), nil); err != nil { + return err + } + if err := tx.Delete(rowIndexKey, nil); err != nil { + return err + } + return nil + }) +} + +func (d *GridKVDriver) GetIDsForTable(tableID uint16) chan string { + out := make(chan string, 256) + go func() { + defer close(out) + + prefix := benchtop.NewPosKeyPrefix(tableID) + err := d.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { + _, rowID := benchtop.ParsePosKey(it.Key()) + if len(rowID) == 0 { + continue + } + out <- string(rowID) + } + return nil + }) + if err == nil { + return + } + + // Fallback to full table scan if pos index scan fails. + store, storeErr := d.TableDr.Get(tableID) + if storeErr != nil { + log.Warningf("GetIDsForTable failed tableID=%d posErr=%v storeErr=%v", tableID, err, storeErr) + return + } + for rowID := range store.ScanId(nil) { + out <- rowID + } + }() + return out +} + +func (d *GridKVDriver) GetIndicesForTable(tableID uint16) chan benchtop.Index { + out := make(chan benchtop.Index, 256) + go func() { + defer close(out) + prefix := benchtop.NewPosKeyPrefix(tableID) + _ = d.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { + _, rowID := benchtop.ParsePosKey(it.Key()) + if len(rowID) == 0 { + continue + } + val, err := it.Value() + if err != nil { + continue + } + loc := benchtop.DecodeRowLoc(val) + if loc == nil { + continue + } + safeID := make([]byte, len(rowID)) + copy(safeID, rowID) + out <- benchtop.Index{Key: safeID, Loc: loc} + } + return nil + }) + }() + return out +} + +func (d *GridKVDriver) RowLocsByLabel(label string) chan benchtop.Index { + tids := d.tableIDsForLabel(label) + out := make(chan benchtop.Index) + go func() { + defer close(out) + var wg sync.WaitGroup + for _, tid := range tids { + wg.Add(1) + go func(tid uint16) { + defer wg.Done() + for idx := range d.GetIndicesForTable(tid) { + out <- idx + } + }(tid) + } + wg.Wait() + }() + return out +} + +func (d *GridKVDriver) LoadFields() error { + fPrefix := benchtop.FieldPrefix + return d.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(fPrefix); it.Valid() && bytes.HasPrefix(it.Key(), fPrefix); it.Next() { + field, tableID, _, _ := benchtop.FieldKeyParse(it.Key()) + info, err := d.TableDr.GetTableInfo(tableID) + if err != nil { + continue + } + table, err := d.GetOrLoadTable(info.Name) + if err != nil { + continue + } + if table.Fields == nil { + table.Fields = map[string]struct{}{} + } + table.Fields[field] = struct{}{} + } + return nil + }) +} + +func (d *GridKVDriver) ValuesWithin(v any) []any { return util.SliceToAny(v) } + +func (d *GridKVDriver) BulkLoadBatch(tx *pebblebulk.PebbleBulk, entries []*benchtop.Row, snap *pebble.Snapshot) error { + if len(entries) == 0 { + return nil + } + + var it *pebble.Iterator + if snap != nil { + it, _ = snap.NewIter(nil) + defer it.Close() + } + + // 1. Group rows by TableID + byTable := make(map[uint16][]*benchtop.Row) + for _, row := range entries { + byTable[row.TableID] = append(byTable[row.TableID], row) + } + + for tid, rows := range byTable { + t, err := d.GetTableByID(tid) + if err != nil { + return err + } + + // Sort rows by ID to maximize iterator spatial locality during existence check + sort.Slice(rows, func(i, j int) bool { + return bytes.Compare(rows[i].Id, rows[j].Id) < 0 + }) + + // 2. Process all entries (uniqueness already verified by caller) + filteredRows := rows + + // Prepare raw rows for the table driver (JSONTable.AddRows uses []Row) + rawRows := make([]benchtop.Row, len(filteredRows)) + for i, r := range filteredRows { + rawRows[i] = *r + } + + // 3. Bulk add rows to the table storage (e.g. JSON/Pebble/Arrow) + locs, err := t.AddRows(rawRows) + if err != nil { + return err + } + + if len(locs) != len(filteredRows) { + return fmt.Errorf("BulkLoadBatch: AddRows returned %d locs for %d rows", len(locs), len(filteredRows)) + } + + // 4. Process each row's index and metadata updates + for i, row := range filteredRows { + rowLoc := locs[i] + idStr := string(row.Id) + + // Update the structural keys (Integrated Keys) + // Check if it's a vertex or edge based on table name prefix + uid, _ := d.GetID(idStr) + if strings.HasPrefix(t.Name, key.VertexTablePrefix) { + vkey := key.VertexKey(uid) + // We need the label. BackendTable has it. + val := benchtop.EncodeVertexValue(t.Label, rowLoc) + if err := tx.Set(vkey, val, nil); err != nil { + return err + } + } else if strings.HasPrefix(t.Name, key.EdgeTablePrefix) { + // For edges, we might need to update multi-keys. + // This is a bit complex in driver if we don't have the From/To. + // But we can check if data has them (BulkAdd puts them there). + fromStr, fOk := row.Data["_from"].(string) + toStr, tOk := row.Data["_to"].(string) + if fOk && tOk { + fuid, _ := d.GetID(fromStr) + tuid, _ := d.GetID(toStr) + val := benchtop.EncodeEdgeValue(t.Label, rowLoc, row.Data) + ekey := key.EdgeKey(uid, fuid, tuid, t.Label) + if err := tx.Set(ekey, val, nil); err != nil { + return err + } + if err := tx.Set(key.SrcEdgeKey(uid, fuid, tuid, t.Label), val, nil); err != nil { + return err + } + if err := tx.Set(key.DstEdgeKey(uid, fuid, tuid, t.Label), val, nil); err != nil { + return err + } + } + } + + // Primary Index partitioned by TableID (useful for scans) + if err := tx.Set(benchtop.NewPosKey(tid, row.Id), benchtop.EncodeRowLoc(rowLoc), nil); err != nil { + return err + } + + // Secondary Index building + if len(t.Fields) > 0 { + for field := range t.Fields { + if val := tpath.PathLookup(row.Data, field); val != nil { + for _, e := range indexer.BuildFieldIndexEntries(tid, field, idStr, val, rowLoc) { + if err := tx.Set(e.Key, e.Value, nil); err != nil { + return err + } + } + } + } + } + } + } + return nil +} + +func (d *GridKVDriver) ListFields() []FieldInfo { + var out []FieldInfo + // Explicitly load all tables to ensure we have their field info + for _, name := range d.TableDr.List() { + if _, err := d.GetOrLoadTable(name); err != nil { + continue + } + } + + d.Lock.RLock() + defer d.Lock.RUnlock() + + for _, table := range d.Tables { + if table == nil { + continue + } + info, err := d.TableDr.GetTableInfo(table.TableId) + if err != nil { + continue + } + for field := range table.Fields { + out = append(out, FieldInfo{Label: info.Name, Field: field}) + } + } + return out +} + +func (d *GridKVDriver) GetLocBatch(ctx context.Context, ids []string) (map[string]*IDInfo, error) { + out := make(map[string]*IDInfo, len(ids)) + uids, err := d.GetIDs(ids) + if err != nil { + return nil, err + } + + for i, id := range ids { + uid := uids[i] + // 1. Check Vertex + vkey := key.VertexKey(uid) + val, closer, err := d.Pkv.Get(vkey) + if err == nil { + vlbl, loc := benchtop.DecodeVertexValue(val) + closer.Close() + if loc != nil { + out[id] = &IDInfo{Label: vlbl, Loc: loc} + continue + } + } + + // 2. Check Edges + ekeyPrefix := key.EdgeKeyPrefix(uid) + var eloc *benchtop.RowLoc + var elbl string + var edata map[string]any + _ = d.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(ekeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), ekeyPrefix); it.Next() { + byteVal, _ := it.Value() + var lbl string + var loc *benchtop.RowLoc + var data map[string]any + lbl, loc, data = benchtop.DecodeEdgeValue(byteVal) + if loc != nil { + eloc = loc + elbl = lbl + edata = data + return nil + } + } + return nil + }) + if eloc != nil { + out[id] = &IDInfo{Label: elbl, Loc: eloc, Data: edata} + } + } + return out, nil +} + +// GetVertexLocBatch resolves vertex RowLocs only (no edge scan fallback). +// This is significantly faster for V() traversal hydration paths. +func (d *GridKVDriver) GetVertexLocBatch(ctx context.Context, ids []string) (map[string]*IDInfo, error) { + out := make(map[string]*IDInfo, len(ids)) + uids, err := d.GetIDs(ids) + if err != nil { + return nil, err + } + for i, id := range ids { + select { + case <-ctx.Done(): + return out, ctx.Err() + default: + } + uid := uids[i] + vkey := key.VertexKey(uid) + val, closer, err := d.Pkv.Get(vkey) + if err != nil { + continue + } + vlbl, loc := benchtop.DecodeVertexValue(val) + closer.Close() + if loc != nil { + out[id] = &IDInfo{Label: vlbl, Loc: loc} + } + } + return out, nil +} + +// GetVertexLocByUIDBatch resolves vertex RowLocs by numeric UID using one iterator +// pass in UID-sorted order to minimize random seeks. +func (d *GridKVDriver) GetVertexLocByUIDBatch(ctx context.Context, uids []uint64) (map[uint64]*IDInfo, error) { + out := make(map[uint64]*IDInfo, len(uids)) + if len(uids) == 0 { + return out, nil + } + dedup := make(map[uint64]struct{}, len(uids)) + uniq := make([]uint64, 0, len(uids)) + for _, uid := range uids { + if uid == 0 { + continue + } + if _, ok := dedup[uid]; ok { + continue + } + dedup[uid] = struct{}{} + uniq = append(uniq, uid) + } + sort.Slice(uniq, func(i, j int) bool { return uniq[i] < uniq[j] }) + + err := d.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for _, uid := range uniq { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + vkey := key.VertexKey(uid) + if err := it.Seek(vkey); err != nil { + continue + } + if !it.Valid() || !bytes.Equal(it.Key(), vkey) { + continue + } + val, err := it.Value() + if err != nil { + continue + } + vlbl, loc := benchtop.DecodeVertexValue(val) + if loc != nil { + out[uid] = &IDInfo{Label: vlbl, Loc: loc} + } + } + return nil + }) + return out, err +} diff --git a/grids/driver/indexer/indexer.go b/grids/driver/indexer/indexer.go new file mode 100644 index 00000000..ba7040db --- /dev/null +++ b/grids/driver/indexer/indexer.go @@ -0,0 +1,51 @@ +package indexer + +import ( + "bytes" + "encoding/binary" + + "github.com/bmeg/benchtop" + "github.com/bytedance/sonic" +) + +type Entry struct { + Key []byte + Value []byte +} + +func PresenceKeys(tableID uint16, field string) (forwardKey []byte, reversePrefix []byte) { + forwardKey = benchtop.FieldKey(field, tableID, nil, nil) + idBytes := make([]byte, 2) + binary.LittleEndian.PutUint16(idBytes, tableID) + reversePrefix = bytes.Join([][]byte{benchtop.RFieldPrefix, idBytes, []byte(field)}, benchtop.FieldSep) + return forwardKey, reversePrefix +} + +func BuildFieldIndexEntries(tableID uint16, field, rowID string, fieldValue any, rowLoc *benchtop.RowLoc) []Entry { + entries := []Entry{ + { + Key: benchtop.FieldKey(field, tableID, fieldValue, []byte(rowID)), + Value: benchtop.EncodeRowLoc(rowLoc), + }, + } + bVal, err := sonic.ConfigFastest.Marshal(fieldValue) + if err != nil { + return entries + } + entries = append(entries, Entry{ + Key: benchtop.RFieldKey(tableID, field, rowID), + Value: bVal, + }) + return entries +} + +func DecodeReverseFieldValue(v []byte) (any, error) { + var out any + if len(v) == 0 { + return nil, nil + } + if err := sonic.ConfigFastest.Unmarshal(v, &out); err != nil { + return nil, err + } + return out, nil +} diff --git a/grids/filter/condition.go b/grids/filter/condition.go new file mode 100644 index 00000000..e7b094d2 --- /dev/null +++ b/grids/filter/condition.go @@ -0,0 +1,38 @@ +package filter + +import ( + "github.com/bmeg/benchtop/query" + "github.com/bmeg/grip/gripql" +) + +// ToQueryCondition converts a GripQL condition to a Bencthop query condition +func ToQueryCondition(c gripql.Condition) query.Condition { + switch c { + case gripql.Condition_EQ: + return query.EQ + case gripql.Condition_NEQ: + return query.NEQ + case gripql.Condition_GT: + return query.GT + case gripql.Condition_GTE: + return query.GTE + case gripql.Condition_LT: + return query.LT + case gripql.Condition_LTE: + return query.LTE + case gripql.Condition_INSIDE: + return query.INSIDE + case gripql.Condition_OUTSIDE: + return query.OUTSIDE + case gripql.Condition_BETWEEN: + return query.BETWEEN + case gripql.Condition_WITHIN: + return query.WITHIN + case gripql.Condition_WITHOUT: + return query.WITHOUT + case gripql.Condition_CONTAINS: + return query.CONTAINS + default: + return query.EQ + } +} diff --git a/grids/filters.go b/grids/filter/filter.go similarity index 52% rename from grids/filters.go rename to grids/filter/filter.go index b9c4bcb7..1c5f65e4 100644 --- a/grids/filters.go +++ b/grids/filter/filter.go @@ -1,6 +1,9 @@ -package grids +package filter import ( + "strconv" + "strings" + bFilters "github.com/bmeg/benchtop/filters" "github.com/bmeg/benchtop/jsontable/table" "github.com/bmeg/grip/gripql" @@ -16,8 +19,8 @@ type GripQLFilter struct { func (f *GripQLFilter) GetFilter() any { return f.Expression } + func (f *GripQLFilter) IsNoOp() bool { - // A GripQLFilter is a no-op if its Expression is nil return f.Expression == nil } @@ -62,52 +65,127 @@ func extractKeys(expr *gripql.HasExpression) []string { return out } +func parseDirectPath(path string) ([]any, bool) { + path = strings.TrimSpace(path) + path = strings.TrimPrefix(path, "$") + path = strings.TrimPrefix(path, ".") + if path == "" { + return nil, false + } + + parts := []any{} + var token strings.Builder + flushToken := func() { + if token.Len() > 0 { + parts = append(parts, token.String()) + token.Reset() + } + } + + for i := 0; i < len(path); i++ { + ch := path[i] + switch ch { + case '.': + flushToken() + case '[': + flushToken() + j := i + 1 + for j < len(path) && path[j] != ']' { + j++ + } + if j >= len(path) || j == i+1 { + return nil, false + } + idx, err := strconv.Atoi(path[i+1 : j]) + if err != nil { + return nil, false + } + parts = append(parts, idx) + i = j + default: + token.WriteByte(ch) + } + } + flushToken() + + if len(parts) == 0 { + return nil, false + } + return parts, true +} + +func sonicLookup(row []byte, condKey string) any { + if path, ok := parseDirectPath(condKey); ok { + node, err := sonic.Get(row, path...) + if err == nil { + v, ierr := node.Interface() + if ierr == nil { + return v + } + } + } + + // Legacy packed-row fallback used by older json table code paths. + pathArr, err := table.ConvertJSONPathToArray(condKey) + if err != nil { + return nil + } + node, err := sonic.Get(row, pathArr...) + if err != nil { + if err != ast.ErrNotExist { + log.Debugf("Sonic fetch error for path %v: %v", pathArr, err) + } + return nil + } + v, ierr := node.Interface() + if ierr != nil { + return nil + } + return v +} + +func tableLabel(tableName string) string { + if len(tableName) > 2 && (strings.HasPrefix(tableName, "v_") || strings.HasPrefix(tableName, "e_")) { + return tableName[2:] + } + return tableName +} + func MatchesHasExpression(row []byte, stmt *gripql.HasExpression, tableName string) bool { + if stmt == nil || stmt.Expression == nil { + return true + } + switch stmt.Expression.(type) { case *gripql.HasExpression_Condition: cond := stmt.GetCondition() var lookupVal any - if cond.Key == "_label" { - lookupVal = tableName[2:] - } else if cond.Key == "_id" { - node, err := sonic.Get(row, []any{"1"}...) - if err != nil { - if err != ast.ErrNotExist { - log.Errorf("Sonic Fetch err for path 1 on doc %#v: %v", string(row), err) - } - return false - } - lookupVal, err = node.Interface() - if err != nil { - log.Errorf("Error unmarshaling node: %v", err) - return false - } - } else { - pathArr, err := table.ConvertJSONPathToArray(cond.Key) - if err != nil { - log.Errorf("Error converting JSON path: %v", err) - return false - } - node, err := sonic.Get(row, pathArr...) - if err != nil { - if err != ast.ErrNotExist { - log.Errorf("Sonic Fetch err for path: %s on doc %#v: %v", pathArr, string(row), err) - return false - } - lookupVal = nil - } else { + + switch cond.Key { + case "_label": + lookupVal = tableLabel(tableName) + case "_id": + node, err := sonic.Get(row, "_id") + if err == nil { lookupVal, err = node.Interface() if err != nil { - log.Errorf("Error unmarshaling node: %v", err) - return false + lookupVal = nil + } + } else { + // Legacy packed-row fallback + node, err = sonic.Get(row, []any{"1"}...) + if err == nil { + lookupVal, _ = node.Interface() } } + default: + lookupVal = sonicLookup(row, cond.Key) } return bFilters.ApplyFilterCondition( lookupVal, &bFilters.FieldFilter{ - Operator: cond.Condition, + Operator: ToQueryCondition(cond.Condition), Field: cond.Key, Value: cond.Value.AsInterface(), }, diff --git a/grids/graph.go b/grids/graph.go index f8fc6bbb..a2fa5d2a 100644 --- a/grids/graph.go +++ b/grids/graph.go @@ -1,1464 +1,24 @@ package grids import ( - "bytes" - "context" - "fmt" - "runtime" - "slices" - "sort" - "sync" - "sync/atomic" + "strings" - "github.com/bmeg/benchtop" - jTable "github.com/bmeg/benchtop/jsontable/table" - "github.com/bmeg/benchtop/jsontable/tpath" - "github.com/bmeg/benchtop/pebblebulk" "github.com/bmeg/grip/engine/core" "github.com/bmeg/grip/gdbi" - "github.com/bmeg/grip/log" - "github.com/bmeg/grip/util/setcmp" - "github.com/bytedance/sonic" - "github.com/cockroachdb/pebble" - multierror "github.com/hashicorp/go-multierror" ) -const ( - VTABLE_PREFIX = "v_" - ETABLE_PREFIX = "e_" -) +func labelFromElementID(id string) string { + if i := strings.IndexByte(id, ':'); i > 0 { + return id[:i] + } + return "" +} // GetTimestamp returns the update timestamp func (ggraph *Graph) GetTimestamp() string { return ggraph.ts.Get(ggraph.graphID) } -func insertVertex(tx *pebblebulk.PebbleBulk, vertex *gdbi.Vertex) error { - if vertex.ID == "" { - return fmt.Errorf("inserting null key vertex") - } - if err := tx.Set(VertexKey(vertex.ID), []byte(vertex.Label), nil); err != nil { - return fmt.Errorf("AddVertex Error %s", err) - } - return nil -} - -func (ggraph *Graph) indexVertex(vertex *gdbi.Vertex, tx *pebblebulk.PebbleBulk) error { - vertexLabel := VTABLE_PREFIX + vertex.Label - ggraph.jsonkv.Lock.Lock() - table, ok := ggraph.jsonkv.Tables[vertexLabel] - ggraph.jsonkv.Lock.Unlock() - if !ok { - log.Debugf("Creating new table %s for label %s on graph %s", vertexLabel, vertex.Label, ggraph.graphID) - newTable, err := ggraph.jsonkv.New(vertexLabel, nil) - if err != nil { - return fmt.Errorf("indexVertex: %s", err) - } - ggraph.jsonkv.Lock.Lock() - table = newTable.(*jTable.JSONTable) - ggraph.jsonkv.Tables[vertexLabel] = table - ggraph.jsonkv.Lock.Unlock() - } - - rowLoc, err := table.AddRow( - benchtop.Row{ - Id: []byte(vertex.ID), - TableName: vertexLabel, - Data: vertex.Data, - }, - ) - if err != nil { - return fmt.Errorf("AddVertex Error %s", err) - } - - err = ggraph.jsonkv.AddTableEntryInfo(tx, []byte(vertex.ID), rowLoc) - if err != nil { - return fmt.Errorf("AddVertex Error %s", err) - } - - _, ok = ggraph.jsonkv.LocCache.Set(vertex.ID, rowLoc) - if !ok { - ggraph.jsonkv.LocCache.Invalidate(vertex.ID) - ggraph.jsonkv.LocCache.Set(vertex.ID, rowLoc) - } - - table, tableExists := ggraph.jsonkv.Tables[vertexLabel] - if tableExists && len(table.Fields) > 0 { - for field := range ggraph.jsonkv.Tables[vertexLabel].Fields { - if val := tpath.PathLookup(vertex.Data, field); val != nil { - err := tx.Set(benchtop.FieldKey(field, vertexLabel, val, []byte(vertex.ID)), []byte{}, nil) - if err != nil { - return err - } - Mval, err := sonic.ConfigFastest.Marshal(val) - if err != nil { - return err - } - err = tx.Set(benchtop.RFieldKey(vertexLabel, field, vertex.ID), Mval, nil) - if err != nil { - return err - } - } - } - } - - return nil -} - -func insertEdge(tx *pebblebulk.PebbleBulk, edge *gdbi.Edge) error { - if edge.ID == "" || - edge.From == "" || - edge.To == "" || - edge.Label == "" { - log.Errorln("insertEdge Err: ", edge) - return fmt.Errorf("inserting null key edge") - } - err := tx.Set(EdgeKey(edge.ID, edge.From, edge.To, edge.Label), nil, nil) - if err != nil { - return err - } - err = tx.Set(DstEdgeKey( - edge.ID, - edge.From, - edge.To, - edge.Label, - ), []byte{}, nil) - if err != nil { - return err - } - err = tx.Set(SrcEdgeKey( - edge.ID, - edge.From, - edge.To, - edge.Label, - ), []byte{}, nil) - if err != nil { - return err - } - return nil -} - -func (ggraph *Graph) indexEdge(edge *gdbi.Edge, tx *pebblebulk.PebbleBulk) error { - edgeLabel := ETABLE_PREFIX + edge.Label - ggraph.jsonkv.Lock.Lock() - table, ok := ggraph.jsonkv.Tables[edgeLabel] - ggraph.jsonkv.Lock.Unlock() - - if !ok { - log.Debugf("Creating new table %s for label %s on graph %s", edgeLabel, edge.Label, ggraph.graphID) - newTable, err := ggraph.jsonkv.New(edgeLabel, nil) - if err != nil { - return fmt.Errorf("indexEdge: jsonkv.New: %s", err) - } - ggraph.jsonkv.Lock.Lock() - table = newTable.(*jTable.JSONTable) - ggraph.jsonkv.Tables[edgeLabel] = table - ggraph.jsonkv.Lock.Unlock() - } - rowLoc, err := table.AddRow(benchtop.Row{Id: []byte(edge.ID), TableName: edgeLabel, Data: edge.Data}) - if err != nil { - return fmt.Errorf("indexEdge: table.AddRow: %s", err) - } - err = ggraph.jsonkv.AddTableEntryInfo(tx, []byte(edge.ID), rowLoc) - if err != nil { - return fmt.Errorf("indexEdge: jsonkv.AddTableEntryInfo: %s", err) - - } - - _, ok = ggraph.jsonkv.LocCache.Set(edge.ID, rowLoc) - if !ok { - ggraph.jsonkv.LocCache.Invalidate(edge.ID) - ggraph.jsonkv.LocCache.Set(edge.ID, rowLoc) - } - - table, tableExists := ggraph.jsonkv.Tables[edgeLabel] - if tableExists && len(table.Fields) > 0 { - for field := range table.Fields { - if val := tpath.PathLookup(edge.Data, field); val != nil { - err := tx.Set(benchtop.FieldKey(field, edgeLabel, val, []byte(edge.ID)), []byte{}, nil) - if err != nil { - return err - } - eMarsh, err := sonic.ConfigFastest.Marshal(val) - if err != nil { - return err - } - err = tx.Set(benchtop.RFieldKey(edgeLabel, field, edge.ID), eMarsh, nil) - if err != nil { - return err - } - } - } - } - return nil -} - func (ggraph *Graph) Compiler() gdbi.Compiler { return core.NewCompiler(ggraph, GridsOptimizer, core.IndexStartOptimize) } - -// AddVertex adds an edge to the graph, if it already exists -// in the graph, it is replaced -func (ggraph *Graph) AddVertex(vertices []*gdbi.Vertex) error { - err := ggraph.jsonkv.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { - var bulkErr *multierror.Error - for _, vert := range vertices { - if err := insertVertex(tx, vert); err != nil { - bulkErr = multierror.Append(bulkErr, err) - log.Errorf("AddVertex Error %s", err) - } - } - ggraph.ts.Touch(ggraph.graphID) - return bulkErr.ErrorOrNil() - }) - - err = ggraph.jsonkv.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { - var bulkErr *multierror.Error - for _, vert := range vertices { - if err := ggraph.indexVertex(vert, tx); err != nil { - bulkErr = multierror.Append(bulkErr, err) - log.Errorf("IndexVertex Error %s", err) - } - } - ggraph.ts.Touch(ggraph.graphID) - return bulkErr.ErrorOrNil() - }) - return err -} - -// AddEdge adds an edge to the graph, if the id is not "" and in already exists -// in the graph, it is replaced -func (ggraph *Graph) AddEdge(edges []*gdbi.Edge) error { - var err error = nil - err = ggraph.jsonkv.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { - err = ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for _, edge := range edges { - err = insertEdge(tx, edge) - if err != nil { - log.Errorln("Err insertEdge: ", err) - return err - } - } - return err - }) - ggraph.ts.Touch(ggraph.graphID) - return err - }) - if err != nil { - return err - } - err = ggraph.jsonkv.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { - var bulkErr *multierror.Error - for _, edge := range edges { - if err := ggraph.indexEdge(edge, tx); err != nil { - bulkErr = multierror.Append(bulkErr, err) - } - } - ggraph.ts.Touch(ggraph.graphID) - return bulkErr.ErrorOrNil() - }) - return err - -} - -func (ggraph *Graph) BulkAdd(stream <-chan *gdbi.GraphElement) error { - var errs *multierror.Error - insertStream := make(chan *gdbi.GraphElement, 100) - indexStream := make(chan *benchtop.Row, 100) - errChan := make(chan error, 2) - - var wg sync.WaitGroup - wg.Add(2) - - go func() { - defer wg.Done() - err := ggraph.jsonkv.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { - for elem := range insertStream { - if elem.Vertex != nil { - if err := insertVertex(tx, elem.Vertex); err != nil { - return fmt.Errorf("vertex insert error: %v", err) - } - } - if elem.Edge != nil { - if err := insertEdge(tx, elem.Edge); err != nil { - return fmt.Errorf("edge insert error: %v", err) - } - } - } - return nil - }) - if err != nil { - log.Errorf("ERR in graph Bulk Add: %s", err) - return - } - - }() - - go func() { - defer wg.Done() - err := ggraph.jsonkv.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { - if err := ggraph.jsonkv.BulkLoad(indexStream, tx); err != nil { - return fmt.Errorf("jsonkv bulk load error: %v", err) - } - ggraph.ts.Touch(ggraph.graphID) - return nil - }) - errChan <- err - }() - - go func() { - defer func() { - close(insertStream) - close(indexStream) - }() - for elem := range stream { - insertStream <- elem - if elem.Vertex != nil { - indexStream <- &benchtop.Row{ - Id: []byte(elem.Vertex.ID), - TableName: VTABLE_PREFIX + elem.Vertex.Label, - Data: elem.Vertex.Data, - } - } - if elem.Edge != nil { - indexStream <- &benchtop.Row{ - Id: []byte(elem.Edge.ID), - TableName: ETABLE_PREFIX + elem.Edge.Label, - Data: elem.Edge.Data, - } - } - } - }() - - wg.Wait() - close(errChan) - - for err := range errChan { - if err != nil { - errs = multierror.Append(errs, err) - } - } - - return errs.ErrorOrNil() -} - -func (ggraph *Graph) DelVertex(id string) error { - vid := VertexKey(id) - skeyPrefix := SrcEdgePrefix(id) - dkeyPrefix := DstEdgePrefix(id) - - delKeys := make([][]byte, 0, 1000) - edgesToDelete := make(map[string]string) - - var bulkErr *multierror.Error - err := ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for it.Seek(skeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), skeyPrefix); it.Next() { - skey := it.Key() - eid, sid, did, label := SrcEdgeKeyParse(skey) - - if ggraph.tempDeletedEdges != nil { - if _, exists := ggraph.tempDeletedEdges[eid]; exists { - continue - } - } - if _, exists := edgesToDelete[eid]; exists { - continue - } - - ekey := EdgeKey(eid, sid, did, label) - dkey := DstEdgeKey(eid, sid, did, label) - delKeys = append(delKeys, ekey, skey, dkey) - edgesToDelete[eid] = label - } - - for it.Seek(dkeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), dkeyPrefix); it.Next() { - dkey := it.Key() - eid, sid, did, label := DstEdgeKeyParse(dkey) - - if ggraph.tempDeletedEdges != nil { - if _, exists := ggraph.tempDeletedEdges[eid]; exists { - continue - } - } - if _, exists := edgesToDelete[eid]; exists { - continue - } - - ekey := EdgeKey(eid, sid, did, label) - skey := SrcEdgeKey(eid, sid, did, label) - delKeys = append(delKeys, ekey, skey, dkey) - edgesToDelete[eid] = label - } - return nil - }) - - if err != nil { - return err - } - - for eid, label := range edgesToDelete { - if err := ggraph.DeleteAnyRow(eid, label, true); err != nil { - bulkErr = multierror.Append(bulkErr, err) - } - - if ggraph.tempDeletedEdges != nil { - ggraph.tempDeletedEdges[eid] = struct{}{} - } - } - - loc, err := ggraph.jsonkv.LocCache.Get(context.Background(), id) - if err != nil { - return err - } - - label := ggraph.jsonkv.LabelLookup[loc.TableId] - if label == "" { - bulkErr = multierror.Append(bulkErr, fmt.Errorf("Failed to lookup table label %d", loc.TableId)) - } - if err := ggraph.DeleteAnyRow(id, label, false); err != nil { - bulkErr = multierror.Append(bulkErr, err) - } - - err = ggraph.jsonkv.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { - if err := tx.DeletePrefix(vid); err != nil { - return err - } - for _, k := range delKeys { - if err := tx.DeletePrefix(k); err != nil { - log.Errorf("BulkWrite failed to delete key %s: %v", string(k), err) - return err - } - } - ggraph.ts.Touch(ggraph.graphID) - return nil - }) - if err != nil { - bulkErr = multierror.Append(bulkErr, err) - } - - return bulkErr.ErrorOrNil() -} - -func (ggraph *Graph) DelEdge(eid string) error { - ekeyPrefix := EdgeKeyPrefix(eid) - var ekey []byte - err := ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for it.Seek(ekeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), ekeyPrefix); it.Next() { - ekey = it.Key() - } - return nil - }) - if err != nil { - return err - } - - if ekey == nil { - log.Debugf("Edge %s not found", eid) - return nil - } - - _, sid, did, lbl := EdgeKeyParse(ekey) - skey := SrcEdgeKey(sid, did, eid, lbl) - dkey := DstEdgeKey(sid, did, eid, lbl) - - var bulkErr *multierror.Error - err = ggraph.jsonkv.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { - if err := tx.Delete(ekey, nil); err != nil { - bulkErr = multierror.Append(bulkErr, err) - } - if err := tx.Delete(skey, nil); err != nil { - bulkErr = multierror.Append(bulkErr, err) - } - if err := tx.Delete(dkey, nil); err != nil { - bulkErr = multierror.Append(bulkErr, err) - } - ggraph.ts.Touch(ggraph.graphID) - return nil - }) - - if err != nil { - bulkErr = multierror.Append(bulkErr, err) - } - - if err := ggraph.DeleteAnyRow(eid, lbl, true); err != nil { - bulkErr = multierror.Append(bulkErr, err) - } - - return bulkErr.ErrorOrNil() -} - -// GetVertex loads a vertex given an id. It returns a nil if not found -func (ggraph *Graph) GetVertex(id string, loadProp bool) *gdbi.Vertex { - ekeyPrefix := VertexKey(id) - var byteLabel []byte = nil - var err error = nil - err = ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for it.Seek(ekeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), ekeyPrefix); it.Next() { - byteLabel, err = it.Value() - } - return nil - }) - if err != nil || byteLabel == nil { - return nil - } - - v := &gdbi.Vertex{ - ID: id, - Label: string(byteLabel), - } - if loadProp { - entry, err := ggraph.jsonkv.LocCache.Get(context.Background(), id) - if err != nil { - log.Errorf("GetVertex: PageCache.Get( error: %v", err) - return nil - } - v.Data, err = ggraph.jsonkv.Tables[VTABLE_PREFIX+v.Label].GetRow(entry) - if err != nil { - log.Errorf("GetVertex: table.GetRow( error: %v", err) - return nil - } - v.Loaded = true - } else { - v.Data = map[string]any{} - } - return v -} - -type elementData struct { - label string - req gdbi.ElementLookup - data []byte -} - -type idEntry struct { - lookup gdbi.ElementLookup - loc *benchtop.RowLoc -} - -func (ggraph *Graph) GetVertexChannel(ctx context.Context, ids chan gdbi.ElementLookup, load bool) chan gdbi.ElementLookup { - out := make(chan gdbi.ElementLookup, 100) - go func() { - defer close(out) - if !load { - for id := range ids { - if id.IsSignal() { - out <- id - } else { - id.Vertex = &gdbi.Vertex{ID: id.ID} - out <- id - } - } - return - } - var batch []idEntry - for id := range ids { - if id.IsSignal() { - out <- id - continue - } - entry, err := ggraph.jsonkv.LocCache.Get(ctx, id.ID) - if err != nil { - log.Errorf("GetVertexChannel: PageCache.Get error: %v", err) - continue - } - batch = append(batch, idEntry{lookup: id, loc: entry}) - if len(batch) >= 1000 { - processBatchWithLabelCache(ggraph, batch, out) - batch = nil - } - } - if len(batch) > 0 { - processBatchWithLabelCache(ggraph, batch, out) - } - }() - return out -} - -type groupKey struct { - TableId uint16 - Section uint16 -} - -func processBatchWithLabelCache(ggraph *Graph, batch []idEntry, out chan gdbi.ElementLookup) { - var wg sync.WaitGroup - sem := make(chan struct{}, 10) - byKey := make(map[groupKey][]idEntry) - for _, entry := range batch { - key := groupKey{TableId: entry.loc.TableId, Section: entry.loc.Section} - byKey[key] = append(byKey[key], entry) - } - for key, entries := range byKey { - wg.Add(1) - go func(key groupKey, entries []idEntry) { - sem <- struct{}{} - defer func() { <-sem; wg.Done() }() - locs := make([]*benchtop.RowLoc, len(entries)) - for i, entry := range entries { - locs[i] = entry.loc - } - Tlabel := ggraph.jsonkv.LabelLookup[key.TableId] - results, errors := ggraph.jsonkv.Tables[VTABLE_PREFIX+Tlabel].GetRows(locs, key.Section) - for i, entry := range entries { - if errors[i] != nil { - log.Errorf("GetVertexChannel: GetRows error for ID %s: %v", entry.lookup.ID, errors[i]) - continue - } - entry.lookup.Vertex = &gdbi.Vertex{ - Data: results[i], - Label: Tlabel, - Loaded: true, - ID: entries[i].lookup.ID, - } - out <- entry.lookup - } - }(key, entries) - } - wg.Wait() -} - -/* -func processBatchWithLabelCache(ggraph *Graph, batch []idEntry, out chan gdbi.ElementLookup) { - var wg sync.WaitGroup - sem := make(chan struct{}, 10) - bySection := make(map[uint16][]idEntry) - for _, entry := range batch { - bySection[entry.loc.Section] = append(bySection[entry.loc.Section], entry) - } - for sectionID, entries := range bySection { - wg.Add(1) - go func(sectionID uint16, entries []idEntry) { - sem <- struct{}{} - defer func() { <-sem; wg.Done() }() - sort.Slice(entries, func(i, j int) bool { - return entries[i].loc.Offset < entries[j].loc.Offset - }) - for _, entry := range entries { - v := gdbi.Vertex{ - ID: entry.lookup.ID, - Label: ggraph.jsonkv.LabelLookup[entry.loc.TableId], - } - data, err := ggraph.jsonkv.Tables[VTABLE_PREFIX+v.Label].GetRow(entry.loc) - if err != nil { - log.Errorf("GetVertexChannel: GetRow error for ID %s: %v", entry.lookup.ID, err) - continue - } - v.Data = data - v.Loaded = true - entry.lookup.Vertex = &v - out <- entry.lookup - } - }(sectionID, entries) - } - wg.Wait() - }*/ - -type lookup struct { - req gdbi.ElementLookup - key string -} - -// GetOutChannel process requests of vertex ids and find the connected vertices on outgoing edges -func (ggraph *Graph) GetOutChannel(ctx context.Context, reqChan chan gdbi.ElementLookup, load bool, emitNull bool, edgeLabels []string) chan gdbi.ElementLookup { - // Todo: implement bulk cache get + bulk get row to try to make this faster - lookupChan := make(chan lookup, 1000) - go func() { - defer close(lookupChan) - ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for req := range reqChan { - if req.IsSignal() { - // Use a select statement to send to lookupChan or check for cancellation - select { - case lookupChan <- lookup{req: req}: - case <-ctx.Done(): - return ctx.Err() // Stop if cancelled while trying to send - } - } else { - found := false - skeyPrefix := SrcEdgePrefix(req.ID) - for it.Seek(skeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), skeyPrefix); it.Next() { - _, _, dst, label := SrcEdgeKeyParse(it.Key()) - if len(edgeLabels) == 0 || setcmp.ContainsString(edgeLabels, label) { - lookupChan <- lookup{ - key: dst, - req: req, - } - found = true - } - } - if !found && emitNull { - lookupChan <- lookup{ - req: req, - key: "", - } - } - } - } - return nil - }) - }() - - o := make(chan gdbi.ElementLookup, 100) - go func() { - defer close(o) - for req := range lookupChan { - if req.req.IsSignal() { - select { - case o <- req.req: - case <-ctx.Done(): - return - } - } else { - if req.key != "" { - entry, err := ggraph.jsonkv.LocCache.Get(ctx, req.key) - if err != nil { - log.Errorf("GetOutChannel: PageCache.Get( error: %v", err) - continue - } - vLabel, ok := ggraph.jsonkv.LabelLookup[entry.TableId] - if !ok { - log.Errorf("GetOutChannel: Label not a string %s", vLabel) - continue - } - v := &gdbi.Vertex{ID: req.key, Label: vLabel} - if load { - v.Data, err = ggraph.jsonkv.Tables[VTABLE_PREFIX+v.Label].GetRow(entry) - if err != nil { - log.Errorf("GetOutChannel: GetRow on %s: %s error: %v", vLabel, req.key, err) - continue - } - v.Loaded = true - } else { - v.Data = map[string]any{} - } - req.req.Vertex = v - o <- req.req - } else { - req.req.Vertex = nil - o <- req.req - } - } - } - }() - return o -} - -// GetInChannel process requests of vertex ids and find the connected vertices on incoming edges -func (ggraph *Graph) GetInChannel(ctx context.Context, reqChan chan gdbi.ElementLookup, load bool, emitNull bool, edgeLabels []string) chan gdbi.ElementLookup { - o := make(chan gdbi.ElementLookup, 100) - go func() { - defer close(o) - ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for req := range reqChan { - if req.IsSignal() { - o <- req - } else { - found := false - dkeyPrefix := DstEdgePrefix(req.ID) - for it.Seek(dkeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), dkeyPrefix); it.Next() { - _, sid, _, label := DstEdgeKeyParse(it.Key()) - if len(edgeLabels) == 0 || setcmp.ContainsString(edgeLabels, label) { - entry, err := ggraph.jsonkv.LocCache.Get(ctx, sid) - if err != nil { - log.Errorf("GetInChannel: PageCache.Get( error: %v", err) - continue - } - - vLabel, ok := ggraph.jsonkv.LabelLookup[entry.TableId] - if !ok { - log.Errorf("GetInChannel Label lookup failed") - continue - } - - v := &gdbi.Vertex{ID: sid, Label: vLabel} - if load { - v.Data, err = ggraph.jsonkv.Tables[VTABLE_PREFIX+v.Label].GetRow(entry) - if err != nil { - log.Errorf("GetInChannel: GetRow on %s: %s error: %v", vLabel, sid, err) - continue - } - v.Loaded = true - } else { - v.Data = map[string]any{} - } - req.Vertex = v - o <- req - found = true - } - } - - if !found && emitNull { - req.Vertex = nil - o <- req - } - } - } - return nil - }) - }() - return o -} - -// GetOutEdgeChannel process requests of vertex ids and find the connected outgoing edges -func (ggraph *Graph) GetOutEdgeChannel(ctx context.Context, reqChan chan gdbi.ElementLookup, load bool, emitNull bool, edgeLabels []string) chan gdbi.ElementLookup { - o := make(chan gdbi.ElementLookup, 100) - go func() { - defer close(o) - ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for req := range reqChan { - if req.IsSignal() { - o <- req - } else { - found := false - skeyPrefix := SrcEdgePrefix(req.ID) - for it.Seek(skeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), skeyPrefix); it.Next() { - eid, src, dst, label := SrcEdgeKeyParse(it.Key()) - if len(edgeLabels) == 0 || setcmp.ContainsString(edgeLabels, label) { - e := gdbi.Edge{ - From: src, - To: dst, - Label: label, - ID: eid, - } - if load { - entry, err := ggraph.jsonkv.LocCache.Get(ctx, e.ID) - if err != nil { - log.Errorf("GetOutEdgeChannel: PageCache.Get( error: %v", err) - continue - } - e.Data, err = ggraph.jsonkv.Tables[ETABLE_PREFIX+e.Label].GetRow(entry) - if err != nil { - log.Errorf("GetOutEdgeChannel: GetRow error: %v", err) - continue - } - e.Loaded = true - } else { - e.Data = map[string]any{} - } - req.Edge = &e - o <- req - found = true - } - } - - if !found && emitNull { - req.Edge = nil - o <- req - } - } - } - return nil - }) - }() - return o -} - -// GetInEdgeChannel process requests of vertex ids and find the connected incoming edges -func (ggraph *Graph) GetInEdgeChannel(ctx context.Context, reqChan chan gdbi.ElementLookup, load bool, emitNull bool, edgeLabels []string) chan gdbi.ElementLookup { - o := make(chan gdbi.ElementLookup, 100) - go func() { - defer close(o) - ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for req := range reqChan { - if req.IsSignal() { - o <- req - } else { - found := false - dkeyPrefix := DstEdgePrefix(req.ID) - for it.Seek(dkeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), dkeyPrefix); it.Next() { - eid, src, dst, label := DstEdgeKeyParse(it.Key()) - if len(edgeLabels) == 0 || setcmp.ContainsString(edgeLabels, label) { - e := gdbi.Edge{ - ID: eid, - From: src, - To: dst, - Label: label, - } - if load { - entry, err := ggraph.jsonkv.LocCache.Get(ctx, e.ID) - if err != nil { - log.Errorf("GetInEdgeChannel: PageCache.Get( error: %v", err) - continue - } - //log.Debugln("IN EDGE LABEL: ", e.Label, "ENTRY: ", entry, "ID: ", e.ID) - - e.Data, err = ggraph.jsonkv.Tables[ETABLE_PREFIX+e.Label].GetRow(entry) - if err != nil { - log.Errorf("GetInEdgeChannel: GetRow error: %v", err) - continue - } - e.Loaded = true - } else { - e.Data = map[string]any{} - } - req.Edge = &e - o <- req - found = true - } - } - - if !found && emitNull { - req.Edge = nil - o <- req - } - } - } - return nil - }) - - }() - return o -} - -// GetEdge loads an edge given an id. It returns nil if not found -func (ggraph *Graph) GetEdge(id string, loadProp bool) *gdbi.Edge { - ekeyPrefix := EdgeKeyPrefix(id) - var e *gdbi.Edge - err := ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for it.Seek(ekeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), ekeyPrefix); it.Next() { - eid, src, dst, label := EdgeKeyParse(it.Key()) - e = &gdbi.Edge{ - ID: eid, - From: src, - To: dst, - Label: label, - } - if loadProp { - entry, err := ggraph.jsonkv.LocCache.Get(context.Background(), e.ID) - if err != nil { - log.Errorf("GetEdge: PageCache.Get( error: %v", err) - continue - } - - e.Data, err = ggraph.jsonkv.Tables[ETABLE_PREFIX+e.Label].GetRow(entry) - if err != nil { - log.Errorf("GetEdge: GetRow error: %v", err) - continue - } - e.Loaded = true - } else { - e.Data = map[string]any{} - } - } - return nil - }) - if err != nil { - return nil - } - return e -} - -// GetVertexList produces a channel of all edges in the graph -func (ggraph *Graph) GetVertexList(ctx context.Context, loadProp bool) <-chan *gdbi.Vertex { - o := make(chan *gdbi.Vertex, 100) - go func() { - defer close(o) - ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - vPrefix := VertexListPrefix() - for it.Seek(vPrefix); it.Valid() && bytes.HasPrefix(it.Key(), vPrefix); it.Next() { - select { - case <-ctx.Done(): - return nil - default: - } - byteLabel, err := it.Value() - if err != nil { - log.Errorf("GetVertexList it.Value() error: %s", err) - } - v := &gdbi.Vertex{ - ID: VertexKeyParse(it.Key()), - Label: string(byteLabel), - } - if loadProp { - entry, err := ggraph.jsonkv.LocCache.Get(ctx, v.ID) - if err != nil { - log.Errorf("GetVertexList: PageCache.Get on %s error: %s", v.ID, err) - continue - } - - v.Data, err = ggraph.jsonkv.Tables[VTABLE_PREFIX+v.Label].GetRow(entry) - if err != nil { - log.Errorf("GetVertexList: table.GetRow error: %s", err) - continue - } - v.Loaded = true - } else { - v.Data = map[string]any{} - } - o <- v - } - return nil - }) - }() - return o -} - -// ListVertexLabels returns a list of vertex types in the graph -func (ggraph *Graph) ListVertexLabels() ([]string, error) { - labels := []string{} - for i := range ggraph.jsonkv.GetLabels(false, true) { - labels = append(labels, i) - } - return labels, nil -} - -// ListEdgeLabels returns a list of edge types in the graph -func (ggraph *Graph) ListEdgeLabels() ([]string, error) { - labels := []string{} - for i := range ggraph.jsonkv.GetLabels(true, true) { - labels = append(labels, i) - } - return labels, nil -} - -// New Bulk Delete Function. Testing... -// BulkDel deletes vertices and edges in bulk. -func (ggraph *Graph) BulkDel(data *gdbi.DeleteData) error { - type keyBatch struct { - singles [][]byte - ranges [][2][]byte - posKeys [][]byte - } - - type itemInfo struct { - id string - label string - isEdge bool - tbl string - } - - type fieldInfo struct { - rKey []byte - field string - tbl string - id []byte - } - - const shardSize = 64 - const bufferSize = 8192 - numCpus := runtime.NumCPU() - ctx := context.Background() - - var bulkErr *multierror.Error - addErr := func(err error) { - if err != nil { - bulkErr = multierror.Append(bulkErr, err) - } - } - - // Sharded bitmap for edge deduplication (lock-free for reads) - type shard struct { - mu sync.Mutex - set map[string]struct{} - count uint32 // Atomic counter for seen edges - } - shards := make([]*shard, shardSize) - for i := range shards { - shards[i] = &shard{set: make(map[string]struct{}, bufferSize/shardSize)} - } - hasSeenEdge := func(eid string) bool { - h := fnv32a(eid) % uint32(shardSize) - shard := shards[h] - shard.mu.Lock() - defer shard.mu.Unlock() - if _, exists := shard.set[eid]; exists { - return true - } - shard.set[eid] = struct{}{} - atomic.AddUint32(&shard.count, 1) - return false - } - getSeenCount := func() uint64 { - var total uint64 - for _, shard := range shards { - total += uint64(atomic.LoadUint32(&shard.count)) - } - return total - } - - // Channels and wait groups - itemChan := make(chan itemInfo, bufferSize) - fieldChan := make(chan fieldInfo, bufferSize) - keyChan := make(chan keyBatch, bufferSize) - var prodWG, consWG, aggWG, fieldWG sync.WaitGroup - - // Aggregator for keys - var singles [][]byte - var ranges [][2][]byte - var posKeys [][]byte - aggWG.Add(1) - go func() { - defer aggWG.Done() - for batch := range keyChan { - select { - case <-ctx.Done(): - return - default: - singles = append(singles, batch.singles...) - ranges = append(ranges, batch.ranges...) - posKeys = append(posKeys, batch.posKeys...) - } - } - }() - - // Aggregator for fields - var allFields []fieldInfo - fieldWG.Add(1) - go func() { - defer fieldWG.Done() - for fi := range fieldChan { - allFields = append(allFields, fi) - } - }() - - // Workers for items - consWG.Add(numCpus) - for range numCpus { - go func() { - defer consWG.Done() - localBatch := keyBatch{posKeys: make([][]byte, 0, bufferSize)} - i := 0 - for item := range itemChan { - select { - case <-ctx.Done(): - return - default: - } - if i%100_000 == 0 && i != 0 { - log.Debugf("[BulkDel worker] processed %d items", i) - } - i++ - - // Fetch from page cache - loc, err := ggraph.jsonkv.LocCache.Get(ctx, item.id) - if err != nil { - addErr(err) - continue - } - - // Mark table for deletion - table, ok := ggraph.jsonkv.Tables[item.tbl] - if !ok { - addErr(fmt.Errorf("table %s not found", item.tbl)) - continue - } - if err := table.MarkDeleteTable(loc); err != nil { - addErr(err) - } - - // Position key - localBatch.posKeys = append(localBatch.posKeys, benchtop.NewPosKey(loc.TableId, []byte(item.id))) - ggraph.jsonkv.LocCache.Invalidate(item.id) - - // Send field infos - // - table, tableExists := ggraph.jsonkv.Tables[item.tbl] - if tableExists && len(table.Fields) > 0 { - for field := range table.Fields { - rKey := benchtop.RFieldKey(item.tbl, field, item.id) - select { - case fieldChan <- fieldInfo{rKey: rKey, field: field, tbl: item.tbl, id: []byte(item.id)}: - case <-ctx.Done(): - return - } - } - } - - if len(localBatch.posKeys) >= 500_000 { - keyChan <- localBatch - localBatch = keyBatch{posKeys: make([][]byte, 0, bufferSize)} - } - } - - if len(localBatch.posKeys) > 0 { - keyChan <- localBatch - } - }() - } - - // Prepare vertex producers - slices.Sort(data.Vertices) - vertexSlices := make([][]string, numCpus) - for i, vid := range data.Vertices { - vertexSlices[i%numCpus] = append(vertexSlices[i%numCpus], vid) - } - for i := range vertexSlices { - slices.Sort(vertexSlices[i]) - } - - for _, slice := range vertexSlices { - if len(slice) == 0 { - continue - } - prodWG.Add(1) - go func(slice []string) { - defer prodWG.Done() - localBatch := keyBatch{singles: make([][]byte, 0, 256), ranges: make([][2][]byte, 0, 256)} - - err := ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for _, vid := range slice { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - sPrefix := SrcEdgePrefix(vid) - if err := it.Seek(sPrefix); err != nil { - return err - } - if it.Valid() && bytes.HasPrefix(it.Key(), sPrefix) { - nextPrefix := upperBound(sPrefix) - if nextPrefix != nil { - localBatch.ranges = append(localBatch.ranges, [2][]byte{sPrefix, nextPrefix}) - } - for it.Valid() && bytes.HasPrefix(it.Key(), sPrefix) { - eid, sid, did, lbl := SrcEdgeKeyParse(it.Key()) - if !hasSeenEdge(eid) { - localBatch.singles = append(localBatch.singles, - EdgeKey(eid, sid, did, lbl), - bytes.Clone(it.Key()), - DstEdgeKey(eid, sid, did, lbl)) - select { - case itemChan <- itemInfo{id: eid, label: lbl, isEdge: true, tbl: ETABLE_PREFIX + lbl}: - case <-ctx.Done(): - return ctx.Err() - } - } - it.Next() - } - } - - dPrefix := DstEdgePrefix(vid) - if err := it.Seek(dPrefix); err != nil { - return err - } - if it.Valid() && bytes.HasPrefix(it.Key(), dPrefix) { - nextPrefix := upperBound(dPrefix) - if nextPrefix != nil { - localBatch.ranges = append(localBatch.ranges, [2][]byte{dPrefix, nextPrefix}) - } - for it.Valid() && bytes.HasPrefix(it.Key(), dPrefix) { - eid, sid, did, lbl := DstEdgeKeyParse(it.Key()) - if !hasSeenEdge(eid) { - localBatch.singles = append(localBatch.singles, - EdgeKey(eid, sid, did, lbl), - SrcEdgeKey(eid, sid, did, lbl), - bytes.Clone(it.Key())) - select { - case itemChan <- itemInfo{id: eid, label: lbl, isEdge: true, tbl: ETABLE_PREFIX + lbl}: - case <-ctx.Done(): - return ctx.Err() - } - } - it.Next() - } - } - - vkey := VertexKey(vid) - if err := it.Seek(vkey); err != nil { - return err - } - var label string - if it.Valid() && bytes.Equal(it.Key(), vkey) { - labelBytes, err := it.Value() - if err != nil { - return err - } - label = string(labelBytes) - } - localBatch.singles = append(localBatch.singles, vkey) - if label != "" { - select { - case itemChan <- itemInfo{id: vid, label: label, isEdge: false, tbl: VTABLE_PREFIX + label}: - case <-ctx.Done(): - return ctx.Err() - } - } - } - return nil - }) - addErr(err) - - if len(localBatch.singles) > 0 || len(localBatch.ranges) > 0 { - select { - case keyChan <- localBatch: - case <-ctx.Done(): - } - } - }(slice) - } - - // Prepare edge producers - slices.Sort(data.Edges) - edgeSlices := make([][]string, numCpus) - for i, eid := range data.Edges { - edgeSlices[i%numCpus] = append(edgeSlices[i%numCpus], eid) - } - for i := range edgeSlices { - slices.Sort(edgeSlices[i]) - } - - for _, slice := range edgeSlices { - if len(slice) == 0 { - continue - } - prodWG.Add(1) - go func(slice []string) { - defer prodWG.Done() - localBatch := keyBatch{singles: make([][]byte, 0, 12), ranges: make([][2][]byte, 0, 12)} - - err := ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for _, eid := range slice { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - if hasSeenEdge(eid) { - continue - } - - prefix := EdgeKeyPrefix(eid) - if err := it.Seek(prefix); err != nil { - return err - } - if it.Valid() && bytes.HasPrefix(it.Key(), prefix) { - nextPrefix := upperBound(prefix) - if nextPrefix != nil { - localBatch.ranges = append(localBatch.ranges, [2][]byte{prefix, nextPrefix}) - } - var label string - for it.Valid() && bytes.HasPrefix(it.Key(), prefix) { - _, sid, did, lbl := EdgeKeyParse(it.Key()) - label = lbl - localBatch.singles = append(localBatch.singles, - SrcEdgeKey(eid, sid, did, lbl), - DstEdgeKey(eid, sid, did, lbl)) - it.Next() - } - if label != "" { - select { - case itemChan <- itemInfo{id: eid, label: label, isEdge: true, tbl: ETABLE_PREFIX + label}: - case <-ctx.Done(): - return ctx.Err() - } - } - } - } - return nil - }) - addErr(err) - - if len(localBatch.singles) > 0 || len(localBatch.ranges) > 0 { - select { - case keyChan <- localBatch: - case <-ctx.Done(): - } - } - }(slice) - } - - // Close channels and wait - go func() { - prodWG.Wait() - close(itemChan) - }() - consWG.Wait() - close(keyChan) - aggWG.Wait() - close(fieldChan) - fieldWG.Wait() - - // Process field indices with single iterator - var indexDelKeys [][]byte - if len(allFields) > 0 { - sort.Slice(allFields, func(i, j int) bool { - return bytes.Compare(allFields[i].rKey, allFields[j].rKey) < 0 - }) - err := ggraph.jsonkv.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for _, fi := range allFields { - if err := it.Seek(fi.rKey); err != nil { - return err - } - if it.Valid() && bytes.Equal(it.Key(), fi.rKey) { - valueBytes, err := it.Value() - if err != nil { - return err - } - var fieldValue any - if err := sonic.ConfigFastest.Unmarshal(valueBytes, &fieldValue); err != nil { - return err - } - if fieldValue != nil { - fKey := benchtop.FieldKey(fi.field, fi.tbl, fieldValue, fi.id) - indexDelKeys = append(indexDelKeys, fKey, fi.rKey) - } - } - } - return nil - }) - addErr(err) - } - - // Chunked deletes with Pebble batch - chunked := func(singles [][]byte, ranges [][2][]byte, posKeys [][]byte, indexDelKeys [][]byte) error { - batch := ggraph.jsonkv.Pkv.Db.NewBatch() - defer batch.Close() - for _, k := range singles { - if err := batch.Delete(k, nil); err != nil { - return err - } - } - for _, r := range ranges { - if err := batch.DeleteRange(r[0], r[1], nil); err != nil { - return err - } - } - for _, k := range posKeys { - if err := batch.Delete(k, nil); err != nil { - return err - } - } - for _, k := range indexDelKeys { - if err := batch.Delete(k, nil); err != nil { - return err - } - } - return batch.Commit(pebble.Sync) - } - - // Perform deletes - ggraph.jsonkv.PebbleLock.Lock() - if err := chunked(singles, ranges, posKeys, indexDelKeys); err != nil { - addErr(err) - } - ggraph.ts.Touch(ggraph.graphID) - ggraph.jsonkv.PebbleLock.Unlock() - - log.Debugf("Total edges seen: %d", getSeenCount()) - return bulkErr.ErrorOrNil() -} - -// upperBound computes the tight upper bound for range delete -func upperBound(prefix []byte) []byte { - ub := make([]byte, len(prefix)) - copy(ub, prefix) - for i := len(ub) - 1; i >= 0; i-- { - if ub[i] < 0xFF { - ub[i]++ - return ub[:i+1] - } - } - return nil -} - -// fnv32a computes FNV-1a 32-bit hash -func fnv32a(s string) uint32 { - var h uint32 = 2166136261 - for i := range s { - h ^= uint32(s[i]) - h *= 16777619 - } - return h -} diff --git a/grids/graph_add.go b/grids/graph_add.go new file mode 100644 index 00000000..dfde2c75 --- /dev/null +++ b/grids/graph_add.go @@ -0,0 +1,521 @@ +package grids + +import ( + "bytes" + "context" + "fmt" + "maps" + "sort" + "sync" + "time" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/jsontable/tpath" + "github.com/bmeg/benchtop/pebblebulk" + "github.com/bmeg/grip/gdbi" + "github.com/bmeg/grip/grids/driver" + "github.com/bmeg/grip/grids/key" + "github.com/bmeg/grip/log" + "github.com/bytedance/sonic" +) + +func insertVertex(tx *pebblebulk.PebbleBulk, id uint64, vertex *gdbi.Vertex, loc *benchtop.RowLoc) error { + val := benchtop.EncodeVertexValue(vertex.Label, loc) + if err := tx.Set(key.VertexKey(id), val, nil); err != nil { + return fmt.Errorf("AddVertex Error %s", err) + } + return nil +} + +func (ggraph *Graph) indexVertices(vertices []*gdbi.Vertex, tx *pebblebulk.PebbleBulk) error { + byLabel := make(map[string][]*gdbi.Vertex) + for _, v := range vertices { + byLabel[v.Label] = append(byLabel[v.Label], v) + } + + for label, verts := range byLabel { + vertexLabel := key.VertexTablePrefix + label + tid, _ := ggraph.driver.TableDr.LookupTableID(vertexLabel) + ggraph.driver.Lock.Lock() + table, ok := ggraph.driver.TablesByID[tid] + ggraph.driver.Lock.Unlock() + + if !ok { + tStore, err := ggraph.driver.Get(vertexLabel) + if err != nil { + log.Debugf("Creating new table %s for label %s on graph %s", vertexLabel, label, ggraph.graphID) + tStore, err = ggraph.driver.New(vertexLabel, nil) + if err != nil { + return fmt.Errorf("indexVertices: %s", err) + } + } + table = tStore.(*driver.BackendTable) + } + + rows := make([]benchtop.Row, len(verts)) + ids := make([]string, len(verts)) + for i, v := range verts { + ids[i] = v.ID + rows[i] = benchtop.Row{ + Id: []byte(v.ID), + Data: v.Data, + TableID: table.TableId, + } + } + + uids, err := ggraph.driver.GetIDs(ids) + if err != nil { + return err + } + + rowLocs, err := table.AddRows(rows) + if err != nil { + return err + } + + for i, v := range verts { + if err := insertVertex(tx, uids[i], v, rowLocs[i]); err != nil { + return err + } + if err := tx.Set(benchtop.NewPosKey(table.TableId, []byte(v.ID)), benchtop.EncodeRowLoc(rowLocs[i]), nil); err != nil { + return err + } + // Index fields + if len(table.Fields) > 0 { + for field := range table.Fields { + if val := tpath.PathLookup(v.Data, field); val != nil { + err := tx.Set(benchtop.FieldKey(field, table.TableId, val, []byte(v.ID)), benchtop.EncodeRowLoc(rowLocs[i]), nil) + if err != nil { + return err + } + Mval, err := sonic.ConfigFastest.Marshal(val) + if err != nil { + return err + } + err = tx.Set(benchtop.RFieldKey(table.TableId, field, v.ID), Mval, nil) + if err != nil { + return err + } + } + } + } + } + } + return nil +} + +func insertEdge(tx *pebblebulk.PebbleBulk, eid, sid, did uint64, edge *gdbi.Edge, loc *benchtop.RowLoc) error { + val := benchtop.EncodeEdgeValue(edge.Label, loc, edge.Data) + if err := tx.Set(key.EdgeKey(eid, sid, did, edge.Label), val, nil); err != nil { + return err + } + if err := tx.Set(key.DstEdgeKey(eid, sid, did, edge.Label), val, nil); err != nil { + return err + } + if err := tx.Set(key.SrcEdgeKey(eid, sid, did, edge.Label), val, nil); err != nil { + return err + } + return nil +} + +func (ggraph *Graph) indexEdges(edges []*gdbi.Edge, tx *pebblebulk.PebbleBulk) error { + byLabel := make(map[string][]*gdbi.Edge) + for _, e := range edges { + if e != nil { + byLabel[e.Label] = append(byLabel[e.Label], e) + } + } + + for label, batch := range byLabel { + edgeLabel := key.EdgeTablePrefix + label + tid, _ := ggraph.driver.TableDr.LookupTableID(edgeLabel) + ggraph.driver.Lock.Lock() + table, ok := ggraph.driver.TablesByID[tid] + ggraph.driver.Lock.Unlock() + + if !ok { + tStore, err := ggraph.driver.Get(edgeLabel) + if err != nil { + log.Debugf("Creating new table %s for edge label %s on graph %s", edgeLabel, label, ggraph.graphID) + tStore, err = ggraph.driver.New(edgeLabel, nil) + if err != nil { + return fmt.Errorf("indexEdges: %s", err) + } + } + table = tStore.(*driver.BackendTable) + } + + rows := make([]benchtop.Row, len(batch)) + ids := make([]string, 0, len(batch)*3) + for i, e := range batch { + ids = append(ids, e.ID, e.From, e.To) + data := make(map[string]any, len(e.Data)+2) + for k, v := range e.Data { + data[k] = v + } + data["_from"] = e.From + data["_to"] = e.To + rows[i] = benchtop.Row{ + Id: []byte(e.ID), + TableID: table.TableId, + Data: data, + } + } + + uids, err := ggraph.driver.GetIDs(ids) + if err != nil { + return err + } + + locs, err := table.AddRows(rows) + if err != nil { + return fmt.Errorf("indexEdges: table.AddRows: %s", err) + } + + for i, e := range batch { + rowLoc := locs[i] + eid, sid, did := uids[i*3], uids[i*3+1], uids[i*3+2] + + // Update the structural keys with the location AND inlined data + if err := insertEdge(tx, eid, sid, did, e, rowLoc); err != nil { + return err + } + if err := tx.Set(benchtop.NewPosKey(table.TableId, []byte(e.ID)), benchtop.EncodeRowLoc(rowLoc), nil); err != nil { + return err + } + + if len(table.Fields) > 0 { + for field := range table.Fields { + if val := tpath.PathLookup(e.Data, field); val != nil { + err := tx.Set(benchtop.FieldKey(field, table.TableId, val, []byte(e.ID)), benchtop.EncodeRowLoc(rowLoc), nil) + if err != nil { + return err + } + eMarsh, err := sonic.ConfigFastest.Marshal(val) + if err != nil { + return err + } + err = tx.Set(benchtop.RFieldKey(table.TableId, field, e.ID), eMarsh, nil) + if err != nil { + return err + } + } + } + } + } + } + return nil +} + +// AddVertex adds an edge to the graph, if it already exists +// in the graph, it is replaced +func (ggraph *Graph) AddVertex(vertices []*gdbi.Vertex) error { + // indexVertices now handles the authoritative integrated key write. + return ggraph.driver.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + if err := ggraph.indexVertices(vertices, tx); err != nil { + log.Errorf("IndexVertices Error %s", err) + return err + } + ggraph.ts.Touch(ggraph.graphID) + return nil + }) +} + +// AddEdge adds an edge to the graph, if the id is not "" and in already exists +// in the graph, it is replaced +func (ggraph *Graph) AddEdge(edges []*gdbi.Edge) error { + // indexEdges now handles the authoritative integrated key write. + return ggraph.driver.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + if err := ggraph.indexEdges(edges, tx); err != nil { + log.Errorf("IndexEdges Error %s", err) + return err + } + ggraph.ts.Touch(ggraph.graphID) + return nil + }) +} + +func (ggraph *Graph) BulkAdd(stream <-chan *gdbi.GraphElement) error { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + const workerBatchSize = 4000 + const writeBatchSize = 4000 + + snap := ggraph.driver.Pkv.Db.NewSnapshot() + defer snap.Close() + + type preparedItem struct { + elem *gdbi.GraphElement + row *benchtop.Row + uid uint64 + suid uint64 + duid uint64 + dbKey []byte + } + + const bufSize = 8192 + ready := make(chan *preparedItem, bufSize) + + var wg sync.WaitGroup + wg.Add(1) + + go func() { + defer wg.Done() + defer close(ready) + it, err := snap.NewIter(nil) + if err != nil { + log.Errorf("BulkAdd worker iterator init failed: %v", err) + return + } + defer it.Close() + + // ─── Worker Buffer & Batching ────────────────────────── + batch := make([]*gdbi.GraphElement, 0, workerBatchSize) + seen := make(map[uint64]struct{}) + tableIDCache := make(map[string]uint16) + + processBatch := func(b []*gdbi.GraphElement) error { + if len(b) == 0 { + return nil + } + batchStart := time.Now() + + // 1. Collect ALL unique IDs in this batch to resolve at once + uniqueIDs := make(map[string]struct{}) + for _, elem := range b { + if elem == nil { + continue + } + if elem.Vertex != nil { + uniqueIDs[elem.Vertex.ID] = struct{}{} + } else if elem.Edge != nil { + uniqueIDs[elem.Edge.ID] = struct{}{} + uniqueIDs[elem.Edge.From] = struct{}{} + uniqueIDs[elem.Edge.To] = struct{}{} + } + } + + // 2. Resolve IDs in bulk + idList := make([]string, 0, len(uniqueIDs)) + for id := range uniqueIDs { + idList = append(idList, id) + } + sort.Strings(idList) + + idVals, err := ggraph.driver.GetIDs(idList) + if err != nil { + return err + } + idResolveElapsed := time.Since(batchStart) + + // 3. Map string -> uint64 for fast lookup + idMap := make(map[string]uint64, len(idList)) + for i, s := range idList { + idMap[s] = idVals[i] + } + + // 4. Transform elements into preparedItems + items := make([]*preparedItem, 0, len(b)) + + for _, elem := range b { + if elem == nil { + continue + } + + // Determine table info + var tName string + if elem.Vertex != nil { + tName = key.VertexTablePrefix + elem.Vertex.Label + } else if elem.Edge != nil { + tName = key.EdgeTablePrefix + elem.Edge.Label + } + + var tid uint16 + if tName != "" { + if cachedTID, ok := tableIDCache[tName]; ok { + tid = cachedTID + } else { + ts, err := ggraph.driver.GetOrLoadTable(tName) + if err != nil { + tStore, nerr := ggraph.driver.New(tName, nil) + if nerr == nil && tStore != nil { + if bt, ok := tStore.(*driver.BackendTable); ok { + tid = bt.TableId + } + } + } else if ts != nil { + tid = ts.TableId + } + if tid != 0 { + tableIDCache[tName] = tid + } + } + } + + var row *benchtop.Row + var uid, suid, duid uint64 + var dbKey []byte + + if elem.Vertex != nil { + uid = idMap[elem.Vertex.ID] + dbKey = key.VertexKey(uid) + data := make(map[string]any, len(elem.Vertex.Data)+1) + maps.Copy(data, elem.Vertex.Data) + data["_label"] = elem.Vertex.Label + row = &benchtop.Row{ + Id: []byte(elem.Vertex.ID), + TableID: tid, + Data: data, + } + } else if elem.Edge != nil { + uid = idMap[elem.Edge.ID] + suid = idMap[elem.Edge.From] + duid = idMap[elem.Edge.To] + dbKey = key.EdgeKey(uid, suid, duid, elem.Edge.Label) + data := make(map[string]any, len(elem.Edge.Data)+3) + maps.Copy(data, elem.Edge.Data) + data["_from"] = elem.Edge.From + data["_to"] = elem.Edge.To + data["_label"] = elem.Edge.Label + row = &benchtop.Row{ + Id: []byte(elem.Edge.ID), + TableID: tid, + Data: data, + } + } + + if row != nil { + items = append(items, &preparedItem{ + elem: elem, + row: row, + uid: uid, + suid: suid, + duid: duid, + dbKey: dbKey, + }) + } + } + + // 5. Sort items by UID to maximize Snapshot.Get locality (block cache efficiency) + sort.Slice(items, func(i, j int) bool { + return items[i].uid < items[j].uid + }) + + transformElapsed := time.Since(batchStart) + + // 6. Check Snapshot and Emit + for _, item := range items { + if item.dbKey != nil { + if _, ok := seen[item.uid]; ok { + continue + } + if it.SeekGE(item.dbKey) && it.Valid() && bytes.Equal(it.Key(), item.dbKey) { + seen[item.uid] = struct{}{} + continue // Skip, graph element already exists + } + seen[item.uid] = struct{}{} + } + select { + case <-ctx.Done(): + return ctx.Err() + case ready <- item: + } + } + + totalElapsed := time.Since(batchStart) + if totalElapsed > 2*time.Second { + log.Infof("BulkAdd worker slow batch=%d uniqueIDs=%d idResolve=%s transform=%s total=%s", len(b), len(idList), idResolveElapsed.Round(time.Millisecond), transformElapsed.Round(time.Millisecond), totalElapsed.Round(time.Millisecond)) + } + + return nil + } + + for { + select { + case <-ctx.Done(): + return + case elem, ok := <-stream: + if !ok { + // Channel closed, flush remaining + if len(batch) > 0 { + _ = processBatch(batch) + } + return + } + batch = append(batch, elem) + if len(batch) >= workerBatchSize { + if err := processBatch(batch); err != nil { + log.Errorf("BulkAdd worker error: %v", err) + return + } + batch = batch[:0] + } + } + } + }() + + // ───────────────────────────────────────────── + // 3. Writer: Batching and I/O (Main thread) + // ───────────────────────────────────────────── + itemBuffer := make([]*preparedItem, 0, writeBatchSize) + + // Removed global snap and it, they will be created per batch. + // snap := ggraph.driver.Pkv.Db.NewSnapshot() + // defer snap.Close() + // + // it, err := snap.NewIter(nil) + // if err != nil { + // return err + // } + // defer it.Close() + + // Removed global 'seen' map. + // seen := make(map[string]struct{}) + + processBatch := func(batch []*preparedItem) error { + if len(batch) == 0 { + return nil + } + + return ggraph.driver.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + // Group rows for the driver + rows := make([]*benchtop.Row, len(batch)) + for i, item := range batch { + rows[i] = item.row + } + + // Bulk Load JSON/Index rows + // Pass nil for snap to disable the redundant (and slower) check in the driver. + // We have already verified uniqueness above using the optimized Get() check. + if err := ggraph.driver.BulkLoadBatch(tx, rows, nil); err != nil { + return err + } + return nil + }) + } + + var writeErr error + for item := range ready { + itemBuffer = append(itemBuffer, item) + if len(itemBuffer) >= writeBatchSize { + if err := processBatch(itemBuffer); err != nil { + writeErr = err + cancel() + break + } + itemBuffer = itemBuffer[:0] + } + } + + if writeErr == nil && len(itemBuffer) > 0 { + if err := processBatch(itemBuffer); err != nil { + writeErr = err + cancel() + } + } + + wg.Wait() + ggraph.ts.Touch(ggraph.graphID) + return writeErr +} diff --git a/grids/graph_delete.go b/grids/graph_delete.go new file mode 100644 index 00000000..f99cff01 --- /dev/null +++ b/grids/graph_delete.go @@ -0,0 +1,1171 @@ +package grids + +import ( + "bytes" + "context" + "encoding/binary" + "fmt" + "os" + "runtime" + "slices" + "sort" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/pebblebulk" + "github.com/bmeg/grip/gdbi" + "github.com/bmeg/grip/grids/driver" + "github.com/bmeg/grip/grids/key" + "github.com/bmeg/grip/log" + "github.com/bytedance/sonic" + "github.com/cockroachdb/pebble" + multierror "github.com/hashicorp/go-multierror" +) + +func (ggraph *Graph) DelVertex(id string) error { + uid, _ := ggraph.driver.GetID(id) + vid := key.VertexKey(uid) + skeyPrefix := key.SrcEdgePrefix(uid) + dkeyPrefix := key.DstEdgePrefix(uid) + + delKeys := make([][]byte, 0, 1000) + type edgeDelInfo struct { + label string + loc *benchtop.RowLoc + } + edgesToDelete := make(map[string]edgeDelInfo) + + var bulkErr *multierror.Error + err := ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(skeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), skeyPrefix); it.Next() { + skey := it.Key() + euid, suid, duid, label := key.SrcEdgeKeyParse(skey) + eid, _ := ggraph.driver.TranslateID(euid) + + if ggraph.tempDeletedEdges != nil { + if _, exists := ggraph.tempDeletedEdges[eid]; exists { + continue + } + } + if _, exists := edgesToDelete[eid]; exists { + continue + } + + ekey := key.EdgeKey(euid, suid, duid, label) + dkey := key.DstEdgeKey(euid, suid, duid, label) + delKeys = append(delKeys, ekey, skey, dkey) + + eVal, _ := it.Value() + _, loc, _ := benchtop.DecodeEdgeValue(eVal) + edgesToDelete[eid] = edgeDelInfo{label: label, loc: loc} + } + + for it.Seek(dkeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), dkeyPrefix); it.Next() { + dkey := it.Key() + euid, suid, duid, label := key.DstEdgeKeyParse(dkey) + eid, _ := ggraph.driver.TranslateID(euid) + + if ggraph.tempDeletedEdges != nil { + if _, exists := ggraph.tempDeletedEdges[eid]; exists { + continue + } + } + if _, exists := edgesToDelete[eid]; exists { + continue + } + + ekey := key.EdgeKey(euid, suid, duid, label) + skey := key.SrcEdgeKey(euid, suid, duid, label) + delKeys = append(delKeys, ekey, skey, dkey) + + eVal, _ := it.Value() + _, loc, _ := benchtop.DecodeEdgeValue(eVal) + edgesToDelete[eid] = edgeDelInfo{label: label, loc: loc} + } + return nil + }) + + if err != nil { + return err + } + + for eid, info := range edgesToDelete { + if err := ggraph.DeleteAnyRow(eid, info.label, true, info.loc); err != nil { + bulkErr = multierror.Append(bulkErr, err) + } + + if ggraph.tempDeletedEdges != nil { + ggraph.tempDeletedEdges[eid] = struct{}{} + } + } + + var vlbl string + var vloc *benchtop.RowLoc + _ = ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + val, err := it.Get(vid) + if err == nil { + vlbl, vloc = benchtop.DecodeVertexValue(val) + } + return nil + }) + + if vloc != nil { + if err := ggraph.DeleteAnyRow(id, vlbl, false, vloc); err != nil { + bulkErr = multierror.Append(bulkErr, err) + } + } + + err = ggraph.driver.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + if err := tx.Delete(vid, nil); err != nil { + return err + } + for _, k := range delKeys { + if err := tx.Delete(k, nil); err != nil { + return err + } + } + return nil + }) + if err != nil { + bulkErr = multierror.Append(bulkErr, err) + } + + return bulkErr.ErrorOrNil() +} + +func (ggraph *Graph) DelEdge(eid string) error { + uid, _ := ggraph.driver.GetID(eid) + ekeyPrefix := key.EdgeKeyPrefix(uid) + var ekey []byte + var eVal []byte + err := ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(ekeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), ekeyPrefix); it.Next() { + ekey = bytes.Clone(it.Key()) + v, _ := it.Value() + eVal = bytes.Clone(v) + } + return nil + }) + if err != nil { + return err + } + + if ekey == nil { + return nil + } + + euid, suid, duid, lbl := key.EdgeKeyParse(ekey) + skey := key.SrcEdgeKey(euid, suid, duid, lbl) + dkey := key.DstEdgeKey(euid, suid, duid, lbl) + + var bulkErr *multierror.Error + err = ggraph.driver.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + tx.Delete(ekey, nil) + tx.Delete(skey, nil) + tx.Delete(dkey, nil) + return nil + }) + + if err != nil { + bulkErr = multierror.Append(bulkErr, err) + } + + _, loc, _ := benchtop.DecodeEdgeValue(eVal) + if loc != nil { + if err := ggraph.DeleteAnyRow(eid, lbl, true, loc); err != nil { + bulkErr = multierror.Append(bulkErr, err) + } + } + + return bulkErr.ErrorOrNil() +} + +// BulkDel deletes vertices and edges in bulk. +func (ggraph *Graph) BulkDel(data *gdbi.DeleteData) error { + type keyBatch struct { + singles [][]byte + ranges [][2][]byte + posKeys [][]byte + } + + type fieldInfo struct { + field string + id []byte + rKey []byte + tableId uint16 + } + + type itemInfo struct { + id string + label string + isEdge bool + tableId uint16 + loc *benchtop.RowLoc + } + type rowDeleteTask struct { + table *driver.BackendTable + id string + loc *benchtop.RowLoc + } + + const shardSize = 64 + const bufferSize = 8192 + numCpus := runtime.NumCPU() + ctx := context.Background() + start := time.Now() + slices.Sort(data.Vertices) + slices.Sort(data.Edges) + log.Infof("BulkDel start graph=%s vertices=%d edges=%d", ggraph.graphID, len(data.Vertices), len(data.Edges)) + var stageMu sync.RWMutex + stage := "enumerate" + setStage := func(s string) { + stageMu.Lock() + stage = s + stageMu.Unlock() + } + getStage := func() string { + stageMu.RLock() + defer stageMu.RUnlock() + return stage + } + inlineRowGC := strings.EqualFold(strings.TrimSpace(os.Getenv("GRIDS_BULK_DELETE_ROW_GC_MODE")), "inline") + if inlineRowGC { + log.Infof("BulkDel row GC mode graph=%s mode=inline", ggraph.graphID) + } else { + log.Infof("BulkDel row GC mode graph=%s mode=deferred", ggraph.graphID) + } + verboseBulkDel := envTruthy("GRIDS_BULK_DELETE_DEBUG") + stallStackDump := envTruthy("GRIDS_BULK_DELETE_STALL_STACK") + bulkDelLogf := func(format string, args ...any) { + if verboseBulkDel { + log.Infof(format, args...) + return + } + log.Debugf(format, args...) + } + + var bulkErr *multierror.Error + var bulkErrMu sync.Mutex + var missingRowsMu sync.Mutex + missingRowsByTable := map[uint16]map[string]struct{}{} + var producerRuns int64 + var itemQueued int64 + var itemProcessed int64 + var keyBatchQueued int64 + var keyBatchProcessed int64 + var keySinglesProcessed int64 + var keyRangesProcessed int64 + var keyPosProcessed int64 + var fieldQueued int64 + var fieldProcessed int64 + var missingRowTargets int64 + var unknownTableScans int64 + var rowDeleteQueued int64 + var rowDeleteDone int64 + var rowDeleteInFlight int64 + var rowDeleteErr int64 + var rowDeleteSections int64 + var workersInLookup int64 + var workersInTableLoad int64 + var workersInFieldEmit int64 + // Channels and wait groups + itemChan := make(chan itemInfo, bufferSize) + fieldChan := make(chan fieldInfo, bufferSize) + keyChan := make(chan keyBatch, bufferSize) + var prodWG, consWG, aggWG, fieldWG sync.WaitGroup + var rowDeleteTasksMu sync.Mutex + rowDeleteTasks := make([]rowDeleteTask, 0, 1024) + + progressStop := make(chan struct{}) + defer close(progressStop) + addErr := func(err error) { + if err != nil { + bulkErrMu.Lock() + bulkErr = multierror.Append(bulkErr, err) + bulkErrMu.Unlock() + } + } + // Sharded bitmap for edge deduplication (lock-free for reads) + type shard struct { + mu sync.Mutex + set map[string]struct{} + count uint32 // Atomic counter for seen edges + } + shards := make([]*shard, shardSize) + for i := range shards { + shards[i] = &shard{set: make(map[string]struct{}, bufferSize/shardSize)} + } + hasSeenEdge := func(eid string) bool { + h := fnv32a(eid) % uint32(shardSize) + shard := shards[h] + shard.mu.Lock() + defer shard.mu.Unlock() + if _, exists := shard.set[eid]; exists { + return true + } + shard.set[eid] = struct{}{} + atomic.AddUint32(&shard.count, 1) + return false + } + getSeenCount := func() uint64 { + var total uint64 + for _, shard := range shards { + total += uint64(atomic.LoadUint32(&shard.count)) + } + return total + } + go func() { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + var prevItems int64 + var prevBatches int64 + var prevFields int64 + stallTicks := 0 + for { + select { + case <-progressStop: + return + case <-ticker.C: + curItems := atomic.LoadInt64(&itemProcessed) + curBatches := atomic.LoadInt64(&keyBatchProcessed) + curFields := atomic.LoadInt64(&fieldProcessed) + if curItems == prevItems && curBatches == prevBatches && curFields == prevFields && atomic.LoadInt64(&producerRuns) > 0 { + stallTicks++ + } else { + stallTicks = 0 + } + prevItems = curItems + prevBatches = curBatches + prevFields = curFields + + curStage := getStage() + + bulkDelLogf( + "BulkDel progress graph=%s stage=%s elapsed=%s producers=%d items=%d/%d itemBacklog=%d keyBatches=%d/%d keyParts[s=%d r=%d p=%d] fields=%d/%d rowDelete[sections=%d q=%d done=%d inFlight=%d err=%d] missingRows=%d unknownScans=%d seenEdges=%d workerState[lookup=%d load=%d emit=%d] chanDepth[item=%d/%d key=%d/%d field=%d/%d] stallTicks=%d", + ggraph.graphID, + curStage, + time.Since(start).Round(time.Second), + atomic.LoadInt64(&producerRuns), + curItems, + atomic.LoadInt64(&itemQueued), + atomic.LoadInt64(&itemQueued)-curItems, + atomic.LoadInt64(&keyBatchProcessed), + atomic.LoadInt64(&keyBatchQueued), + atomic.LoadInt64(&keySinglesProcessed), + atomic.LoadInt64(&keyRangesProcessed), + atomic.LoadInt64(&keyPosProcessed), + atomic.LoadInt64(&fieldQueued), + curFields, + atomic.LoadInt64(&rowDeleteSections), + atomic.LoadInt64(&rowDeleteQueued), + atomic.LoadInt64(&rowDeleteDone), + atomic.LoadInt64(&rowDeleteInFlight), + atomic.LoadInt64(&rowDeleteErr), + atomic.LoadInt64(&missingRowTargets), + atomic.LoadInt64(&unknownTableScans), + getSeenCount(), + atomic.LoadInt64(&workersInLookup), + atomic.LoadInt64(&workersInTableLoad), + atomic.LoadInt64(&workersInFieldEmit), + len(itemChan), + cap(itemChan), + len(keyChan), + cap(keyChan), + len(fieldChan), + cap(fieldChan), + stallTicks, + ) + if stallTicks >= 3 { + log.Warningf( + "BulkDel appears stalled graph=%s stage=%s stallTicks=%d producers=%d backlog=%d rowDelete[sections=%d q=%d done=%d inFlight=%d err=%d] workerState[lookup=%d load=%d emit=%d] chanDepth[item=%d/%d key=%d/%d field=%d/%d]", + ggraph.graphID, + curStage, + stallTicks, + atomic.LoadInt64(&producerRuns), + atomic.LoadInt64(&itemQueued)-curItems, + atomic.LoadInt64(&rowDeleteSections), + atomic.LoadInt64(&rowDeleteQueued), + atomic.LoadInt64(&rowDeleteDone), + atomic.LoadInt64(&rowDeleteInFlight), + atomic.LoadInt64(&rowDeleteErr), + atomic.LoadInt64(&workersInLookup), + atomic.LoadInt64(&workersInTableLoad), + atomic.LoadInt64(&workersInFieldEmit), + len(itemChan), + cap(itemChan), + len(keyChan), + cap(keyChan), + len(fieldChan), + cap(fieldChan), + ) + if stallStackDump && (stallTicks == 3 || stallTicks%6 == 0) { + buf := make([]byte, 1<<20) + n := runtime.Stack(buf, true) + log.Warningf("BulkDel stall goroutine dump graph=%s stallTicks=%d\n%s", ggraph.graphID, stallTicks, string(buf[:n])) + } + } + } + } + }() + + // Aggregator for keys + var singles [][]byte + var ranges [][2][]byte + var posKeys [][]byte + aggWG.Add(1) + go func() { + defer aggWG.Done() + for batch := range keyChan { + atomic.AddInt64(&keyBatchProcessed, 1) + atomic.AddInt64(&keySinglesProcessed, int64(len(batch.singles))) + atomic.AddInt64(&keyRangesProcessed, int64(len(batch.ranges))) + atomic.AddInt64(&keyPosProcessed, int64(len(batch.posKeys))) + select { + case <-ctx.Done(): + return + default: + singles = append(singles, batch.singles...) + ranges = append(ranges, batch.ranges...) + posKeys = append(posKeys, batch.posKeys...) + } + } + }() + + // Aggregator for fields + var allFields []fieldInfo + fieldWG.Add(1) + go func() { + defer fieldWG.Done() + for fi := range fieldChan { + allFields = append(allFields, fi) + atomic.AddInt64(&fieldProcessed, 1) + } + }() + + enqueueItem := func(item itemInfo, source string) error { + const warnEvery = 10 * time.Second + timer := time.NewTimer(warnEvery) + defer timer.Stop() + for { + select { + case itemChan <- item: + atomic.AddInt64(&itemQueued, 1) + return nil + case <-ctx.Done(): + return ctx.Err() + case <-timer.C: + log.Warningf( + "BulkDel blocked enqueue item graph=%s source=%s id=%s label=%s isEdge=%t chanDepth=%d/%d backlog=%d workerState[lookup=%d load=%d emit=%d]", + ggraph.graphID, + source, + item.id, + item.label, + item.isEdge, + len(itemChan), + cap(itemChan), + atomic.LoadInt64(&itemQueued)-atomic.LoadInt64(&itemProcessed), + atomic.LoadInt64(&workersInLookup), + atomic.LoadInt64(&workersInTableLoad), + atomic.LoadInt64(&workersInFieldEmit), + ) + timer.Reset(warnEvery) + } + } + } + + enqueueField := func(fi fieldInfo, source string) error { + const warnEvery = 10 * time.Second + timer := time.NewTimer(warnEvery) + defer timer.Stop() + for { + select { + case fieldChan <- fi: + atomic.AddInt64(&fieldQueued, 1) + return nil + case <-ctx.Done(): + return ctx.Err() + case <-timer.C: + log.Warningf( + "BulkDel blocked enqueue field graph=%s source=%s tableID=%d field=%s chanDepth=%d/%d backlog=%d", + ggraph.graphID, + source, + fi.tableId, + fi.field, + len(fieldChan), + cap(fieldChan), + atomic.LoadInt64(&fieldQueued)-atomic.LoadInt64(&fieldProcessed), + ) + timer.Reset(warnEvery) + } + } + } + + enqueueKeyBatch := func(batch keyBatch, source string) error { + if len(batch.singles) == 0 && len(batch.ranges) == 0 && len(batch.posKeys) == 0 { + return nil + } + const warnEvery = 10 * time.Second + timer := time.NewTimer(warnEvery) + defer timer.Stop() + for { + select { + case keyChan <- batch: + atomic.AddInt64(&keyBatchQueued, 1) + return nil + case <-ctx.Done(): + return ctx.Err() + case <-timer.C: + log.Warningf( + "BulkDel blocked enqueue keyBatch graph=%s source=%s parts[s=%d r=%d p=%d] chanDepth=%d/%d backlog=%d", + ggraph.graphID, + source, + len(batch.singles), + len(batch.ranges), + len(batch.posKeys), + len(keyChan), + cap(keyChan), + atomic.LoadInt64(&keyBatchQueued)-atomic.LoadInt64(&keyBatchProcessed), + ) + timer.Reset(warnEvery) + } + } + } + + // Workers for items + consWG.Add(numCpus) + for range numCpus { + go func() { + defer consWG.Done() + localBatch := keyBatch{posKeys: make([][]byte, 0, bufferSize)} + i := 0 + for item := range itemChan { + atomic.AddInt64(&itemProcessed, 1) + select { + case <-ctx.Done(): + return + default: + } + if i%100_000 == 0 && i != 0 { + bulkDelLogf("[BulkDel worker] processed %d items", i) + } + i++ + + // Use the RowLoc passed in itemInfo + loc := item.loc + if loc == nil { + continue + } + + // Resolve table and mark for deletion if it exists + var table *driver.BackendTable + atomic.AddInt64(&workersInLookup, 1) + ggraph.driver.Lock.RLock() + table = ggraph.driver.TablesByID[loc.TableId] + ggraph.driver.Lock.RUnlock() + atomic.AddInt64(&workersInLookup, -1) + + if table == nil { + atomic.AddInt64(&workersInTableLoad, 1) + // Try to load + if info, err := ggraph.driver.TableDr.GetTableInfo(loc.TableId); err == nil { + table, _ = ggraph.driver.GetOrLoadTable(info.Name) + } + atomic.AddInt64(&workersInTableLoad, -1) + } + + hasTable := (table != nil) + if hasTable && table.TableId != loc.TableId { + log.Warningf("Logic error: table mismatch %d vs %d", table.TableId, loc.TableId) + } + + // Use authoritative ID + currentTableId := loc.TableId + + // Position key matches the new P | TableId | rowID format + localBatch.posKeys = append(localBatch.posKeys, benchtop.NewPosKey(currentTableId, []byte(item.id))) + + // Deleting from the underlying table storage (dedicated stage for reliability) + if hasTable && inlineRowGC { + rowDeleteTasksMu.Lock() + rowDeleteTasks = append(rowDeleteTasks, rowDeleteTask{table: table, id: item.id, loc: loc}) + rowDeleteTasksMu.Unlock() + atomic.AddInt64(&rowDeleteQueued, 1) + } + + // Send field infos + if hasTable && len(table.Fields) > 0 { + atomic.AddInt64(&workersInFieldEmit, 1) + for field := range table.Fields { + rKey := benchtop.RFieldKey(currentTableId, field, item.id) + if err := enqueueField(fieldInfo{rKey: rKey, field: field, tableId: currentTableId, id: []byte(item.id)}, "item_worker"); err != nil { + atomic.AddInt64(&workersInFieldEmit, -1) + return + } + } + atomic.AddInt64(&workersInFieldEmit, -1) + } else if !hasTable { + // Avoid O(rows * tableFields) scans: queue missing table+row and resolve + // fields in one batched reverse-index scan per table after workers finish. + missingRowsMu.Lock() + if _, ok := missingRowsByTable[currentTableId]; !ok { + missingRowsByTable[currentTableId] = map[string]struct{}{} + } + if _, exists := missingRowsByTable[currentTableId][item.id]; !exists { + missingRowsByTable[currentTableId][item.id] = struct{}{} + atomic.AddInt64(&missingRowTargets, 1) + } + missingRowsMu.Unlock() + } + + if len(localBatch.posKeys) >= 500_000 { + if err := enqueueKeyBatch(localBatch, "item_worker"); err != nil { + return + } + localBatch = keyBatch{posKeys: make([][]byte, 0, bufferSize)} + } + } + + if len(localBatch.posKeys) > 0 { + if err := enqueueKeyBatch(localBatch, "item_worker_flush"); err != nil { + return + } + } + }() + } + + // Prepare vertex producers + vertexSlices := make([][]string, numCpus) + for i, vid := range data.Vertices { + vertexSlices[i%numCpus] = append(vertexSlices[i%numCpus], vid) + } + for i := range vertexSlices { + slices.Sort(vertexSlices[i]) + } + + for _, slice := range vertexSlices { + if len(slice) == 0 { + continue + } + prodWG.Add(1) + go func(slice []string) { + defer prodWG.Done() + atomic.AddInt64(&producerRuns, 1) + defer atomic.AddInt64(&producerRuns, -1) + localBatch := keyBatch{singles: make([][]byte, 0, 256), ranges: make([][2][]byte, 0, 256)} + + err := ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for _, vid := range slice { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + uid, _ := ggraph.driver.GetID(vid) + sPrefix := key.SrcEdgePrefix(uid) + if err := it.Seek(sPrefix); err != nil { + return err + } + if it.Valid() && bytes.HasPrefix(it.Key(), sPrefix) { + nextPrefix := upperBound(sPrefix) + if nextPrefix != nil { + localBatch.ranges = append(localBatch.ranges, [2][]byte{sPrefix, nextPrefix}) + } + for it.Valid() && bytes.HasPrefix(it.Key(), sPrefix) { + euid, suid, duid, lbl := key.SrcEdgeKeyParse(it.Key()) + eid, _ := ggraph.driver.TranslateID(euid) + eVal, _ := it.Value() + _, loc, _ := benchtop.DecodeEdgeValue(eVal) + if !hasSeenEdge(eid) { + localBatch.singles = append(localBatch.singles, + key.EdgeKey(euid, suid, duid, lbl), + bytes.Clone(it.Key()), + key.DstEdgeKey(euid, suid, duid, lbl)) + tid, _ := ggraph.driver.TableDr.LookupTableID("e_" + lbl) + if err := enqueueItem(itemInfo{id: eid, label: lbl, isEdge: true, tableId: tid, loc: loc}, "vertex_src_edge"); err != nil { + return err + } + } + it.Next() + } + } + + dPrefix := key.DstEdgePrefix(uid) + if err := it.Seek(dPrefix); err != nil { + return err + } + if it.Valid() && bytes.HasPrefix(it.Key(), dPrefix) { + nextPrefix := upperBound(dPrefix) + if nextPrefix != nil { + localBatch.ranges = append(localBatch.ranges, [2][]byte{dPrefix, nextPrefix}) + } + for it.Valid() && bytes.HasPrefix(it.Key(), dPrefix) { + euid, suid, duid, lbl := key.DstEdgeKeyParse(it.Key()) + eid, _ := ggraph.driver.TranslateID(euid) + eVal, _ := it.Value() + _, loc, _ := benchtop.DecodeEdgeValue(eVal) + if !hasSeenEdge(eid) { + localBatch.singles = append(localBatch.singles, + key.EdgeKey(euid, suid, duid, lbl), + key.SrcEdgeKey(euid, suid, duid, lbl), + bytes.Clone(it.Key())) + tid, _ := ggraph.driver.TableDr.LookupTableID("e_" + lbl) + if err := enqueueItem(itemInfo{id: eid, label: lbl, isEdge: true, tableId: tid, loc: loc}, "vertex_dst_edge"); err != nil { + return err + } + } + it.Next() + } + } + + vkey := key.VertexKey(uid) + if err := it.Seek(vkey); err != nil { + return err + } + var vlabel string + var vloc *benchtop.RowLoc + if it.Valid() && bytes.Equal(it.Key(), vkey) { + vBytes, err := it.Value() + if err != nil { + return err + } + vlabel, vloc = benchtop.DecodeVertexValue(vBytes) + } + localBatch.singles = append(localBatch.singles, vkey) + if vlabel != "" { + tid, _ := ggraph.driver.TableDr.LookupTableID("v_" + vlabel) + if err := enqueueItem(itemInfo{id: vid, label: vlabel, isEdge: false, tableId: tid, loc: vloc}, "vertex_record"); err != nil { + return err + } + } + } + return nil + }) + addErr(err) + + if len(localBatch.singles) > 0 || len(localBatch.ranges) > 0 { + if err := enqueueKeyBatch(localBatch, "vertex_producer_flush"); err != nil { + addErr(err) + } + } + }(slice) + } + + // Prepare edge producers + edgeSlices := make([][]string, numCpus) + for i, eid := range data.Edges { + edgeSlices[i%numCpus] = append(edgeSlices[i%numCpus], eid) + } + for i := range edgeSlices { + slices.Sort(edgeSlices[i]) + } + + for _, slice := range edgeSlices { + if len(slice) == 0 { + continue + } + prodWG.Add(1) + go func(slice []string) { + defer prodWG.Done() + atomic.AddInt64(&producerRuns, 1) + defer atomic.AddInt64(&producerRuns, -1) + localBatch := keyBatch{singles: make([][]byte, 0, 12), ranges: make([][2][]byte, 0, 12)} + + err := ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for _, eid := range slice { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + if hasSeenEdge(eid) { + continue + } + + uid, _ := ggraph.driver.GetID(eid) + prefix := key.EdgeKeyPrefix(uid) + if err := it.Seek(prefix); err != nil { + return err + } + if it.Valid() && bytes.HasPrefix(it.Key(), prefix) { + nextPrefix := upperBound(prefix) + if nextPrefix != nil { + localBatch.ranges = append(localBatch.ranges, [2][]byte{prefix, nextPrefix}) + } + var eLabel string + var eLoc *benchtop.RowLoc + for it.Valid() && bytes.HasPrefix(it.Key(), prefix) { + euid, suid, duid, lbl := key.EdgeKeyParse(it.Key()) + eLabel = lbl + eVal, _ := it.Value() + _, eLoc, _ = benchtop.DecodeEdgeValue(eVal) + localBatch.singles = append(localBatch.singles, + key.SrcEdgeKey(euid, suid, duid, lbl), + key.DstEdgeKey(euid, suid, duid, lbl)) + it.Next() + } + if eLabel != "" { + tid, _ := ggraph.driver.TableDr.LookupTableID("e_" + eLabel) + if err := enqueueItem(itemInfo{id: eid, label: eLabel, isEdge: true, tableId: tid, loc: eLoc}, "edge_producer"); err != nil { + return err + } + } + } + } + return nil + }) + addErr(err) + + if len(localBatch.singles) > 0 || len(localBatch.ranges) > 0 { + if err := enqueueKeyBatch(localBatch, "edge_producer_flush"); err != nil { + addErr(err) + } + } + }(slice) + } + + // Close channels and wait + go func() { + prodWG.Wait() + bulkDelLogf("BulkDel producers complete graph=%s queuedItems=%d queuedKeyBatches=%d", ggraph.graphID, atomic.LoadInt64(&itemQueued), atomic.LoadInt64(&keyBatchQueued)) + close(itemChan) + }() + bulkDelLogf("BulkDel waiting workers graph=%s", ggraph.graphID) + consWG.Wait() + bulkDelLogf("BulkDel item workers complete graph=%s processedItems=%d", ggraph.graphID, atomic.LoadInt64(&itemProcessed)) + close(keyChan) + aggWG.Wait() + bulkDelLogf("BulkDel key aggregation complete graph=%s singles=%d ranges=%d posKeys=%d", ggraph.graphID, len(singles), len(ranges), len(posKeys)) + + // Phase 2: row tombstones (single writer per section for reliability). + deleteTasks := make([]rowDeleteTask, 0) + if inlineRowGC { + setStage("row_delete") + rowDeleteTasksMu.Lock() + deleteTasks = append(deleteTasks, rowDeleteTasks...) + rowDeleteTasksMu.Unlock() + sort.Slice(deleteTasks, func(i, j int) bool { + li := deleteTasks[i].loc + lj := deleteTasks[j].loc + if li.Section != lj.Section { + return li.Section < lj.Section + } + if deleteTasks[i].table.TableId != deleteTasks[j].table.TableId { + return deleteTasks[i].table.TableId < deleteTasks[j].table.TableId + } + return deleteTasks[i].id < deleteTasks[j].id + }) + + resumeFrom := 0 + atomic.StoreInt64(&rowDeleteQueued, int64(len(deleteTasks))) + atomic.StoreInt64(&rowDeleteDone, int64(resumeFrom)) + + var rowDeleteWG sync.WaitGroup + var rowDeleteMu sync.Mutex + rowDeleteBySection := make(map[uint16]chan rowDeleteTask) + dispatchRowDelete := func(task rowDeleteTask) bool { + sectionID := task.loc.Section + rowDeleteMu.Lock() + ch, ok := rowDeleteBySection[sectionID] + if !ok { + ch = make(chan rowDeleteTask, 1024) + rowDeleteBySection[sectionID] = ch + atomic.AddInt64(&rowDeleteSections, 1) + rowDeleteWG.Add(1) + go func(sectionID uint16, taskCh chan rowDeleteTask) { + defer rowDeleteWG.Done() + for task := range taskCh { + atomic.AddInt64(&rowDeleteInFlight, 1) + ggraph.driver.TableDr.InvalidateLoc(task.loc.TableId, task.id) + t0 := time.Now() + if err := task.table.DeleteRow(task.loc, []byte(task.id)); err != nil { + atomic.AddInt64(&rowDeleteErr, 1) + addErr(fmt.Errorf("failed to delete row %s from table %s section=%d: %w", task.id, task.table.Name, sectionID, err)) + } + if d := time.Since(t0); d > 3*time.Second { + log.Warningf("BulkDel slow row tombstone graph=%s section=%d table=%s id=%s duration=%s", ggraph.graphID, sectionID, task.table.Name, task.id, d) + } + atomic.AddInt64(&rowDeleteDone, 1) + atomic.AddInt64(&rowDeleteInFlight, -1) + } + }(sectionID, ch) + } + rowDeleteMu.Unlock() + const warnEvery = 10 * time.Second + timer := time.NewTimer(warnEvery) + defer timer.Stop() + for { + select { + case ch <- task: + return true + case <-ctx.Done(): + return false + case <-timer.C: + log.Warningf( + "BulkDel blocked row tombstone dispatch graph=%s section=%d chanDepth=%d/%d rowDelete[queued=%d done=%d inFlight=%d err=%d]", + ggraph.graphID, + sectionID, + len(ch), + cap(ch), + atomic.LoadInt64(&rowDeleteQueued), + atomic.LoadInt64(&rowDeleteDone), + atomic.LoadInt64(&rowDeleteInFlight), + atomic.LoadInt64(&rowDeleteErr), + ) + timer.Reset(warnEvery) + } + } + } + + for i := resumeFrom; i < len(deleteTasks); i++ { + if !dispatchRowDelete(deleteTasks[i]) { + addErr(ctx.Err()) + break + } + } + rowDeleteMu.Lock() + sectionCount := len(rowDeleteBySection) + for _, ch := range rowDeleteBySection { + close(ch) + } + rowDeleteMu.Unlock() + bulkDelLogf("BulkDel waiting row tombstone workers graph=%s sections=%d", ggraph.graphID, sectionCount) + rowDeleteWG.Wait() + log.Infof( + "BulkDel row tombstone stage complete graph=%s sections=%d queued=%d done=%d inFlight=%d err=%d", + ggraph.graphID, + atomic.LoadInt64(&rowDeleteSections), + atomic.LoadInt64(&rowDeleteQueued), + atomic.LoadInt64(&rowDeleteDone), + atomic.LoadInt64(&rowDeleteInFlight), + atomic.LoadInt64(&rowDeleteErr), + ) + } else { + setStage("row_gc_deferred") + log.Infof("BulkDel row tombstone stage deferred graph=%s mode=deferred", ggraph.graphID) + } + + // Resolve reverse index keys for rows whose table metadata was not loaded. + setStage("resolve_missing_fields") + missingRowsMu.Lock() + missingTableCount := len(missingRowsByTable) + missingRowsMu.Unlock() + if missingTableCount > 0 { + bulkDelLogf("BulkDel resolving unknown-table reverse indexes graph=%s tables=%d", ggraph.graphID, missingTableCount) + missingRowsMu.Lock() + for tableID, rowSet := range missingRowsByTable { + if len(rowSet) == 0 { + continue + } + atomic.AddInt64(&unknownTableScans, 1) + tableScanStart := time.Now() + matched := 0 + bulkDelLogf("BulkDel unknown-table scan start graph=%s tableID=%d rowTargets=%d", ggraph.graphID, tableID, len(rowSet)) + tableIDBytes := binary.LittleEndian.AppendUint16(nil, tableID) + rPrefix := bytes.Join([][]byte{benchtop.RFieldPrefix, tableIDBytes}, benchtop.FieldSep) + err := ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(rPrefix); it.Valid() && bytes.HasPrefix(it.Key(), rPrefix); it.Next() { + keyBytes := bytes.Clone(it.Key()) + // key format: R tableID(2 bytes) field rowID + if len(keyBytes) <= len(rPrefix) || keyBytes[len(rPrefix)] != benchtop.FieldSep[0] { + continue + } + tail := keyBytes[len(rPrefix)+1:] + fieldEnd := bytes.IndexByte(tail, benchtop.FieldSep[0]) + if fieldEnd < 0 || fieldEnd+1 >= len(tail) { + continue + } + field := string(tail[:fieldEnd]) + rowID := string(tail[fieldEnd+1:]) + if _, ok := rowSet[rowID]; !ok { + continue + } + matched++ + if err := enqueueField(fieldInfo{rKey: benchtop.RFieldKey(tableID, field, rowID), field: field, tableId: tableID, id: []byte(rowID)}, "unknown_table_scan"); err != nil { + return err + } + } + return nil + }) + addErr(err) + bulkDelLogf( + "BulkDel unknown-table scan done graph=%s tableID=%d rowTargets=%d matched=%d duration=%s", + ggraph.graphID, + tableID, + len(rowSet), + matched, + time.Since(tableScanStart), + ) + } + missingRowsMu.Unlock() + } + + close(fieldChan) + fieldWG.Wait() + bulkDelLogf("BulkDel field aggregation complete graph=%s fields=%d", ggraph.graphID, len(allFields)) + + // Process field indices with single iterator + setStage("collect_index_keys") + var indexDelKeys [][]byte + if len(allFields) > 0 { + sort.Slice(allFields, func(i, j int) bool { + return bytes.Compare(allFields[i].rKey, allFields[j].rKey) < 0 + }) + err := ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for _, fi := range allFields { + if err := it.Seek(fi.rKey); err != nil { + return err + } + if it.Valid() && bytes.Equal(it.Key(), fi.rKey) { + valueBytes, err := it.Value() + if err != nil { + return err + } + var fieldValue any + if err := sonic.ConfigFastest.Unmarshal(valueBytes, &fieldValue); err != nil { + return err + } + if fieldValue != nil { + fKey := benchtop.FieldKey(fi.field, fi.tableId, fieldValue, fi.id) + indexDelKeys = append(indexDelKeys, fKey, fi.rKey) + } + } + } + return nil + }) + addErr(err) + } + bulkDelLogf("BulkDel index key collection complete graph=%s indexDelKeys=%d", ggraph.graphID, len(indexDelKeys)) + + // Chunked deletes with periodic commit so very large deletes don't block on one huge sync. + chunked := func(singles [][]byte, ranges [][2][]byte, posKeys [][]byte, indexDelKeys [][]byte) error { + const maxOpsPerBatch = 200_000 + + batch := ggraph.driver.Pkv.Db.NewBatch() + ops := 0 + commits := 0 + flush := func(force bool) error { + if !force && ops < maxOpsPerBatch { + return nil + } + if ops == 0 { + return nil + } + if err := batch.Commit(pebble.Sync); err != nil { + return err + } + if err := batch.Close(); err != nil { + return err + } + commits++ + if commits%5 == 0 { + bulkDelLogf("BulkDel chunk commit graph=%s commits=%d", ggraph.graphID, commits) + } + batch = ggraph.driver.Pkv.Db.NewBatch() + ops = 0 + return nil + } + defer batch.Close() + + for _, k := range singles { + if err := batch.Delete(k, nil); err != nil { + return err + } + ops++ + if err := flush(false); err != nil { + return err + } + } + for _, r := range ranges { + if err := batch.DeleteRange(r[0], r[1], nil); err != nil { + return err + } + ops++ + if err := flush(false); err != nil { + return err + } + } + for _, k := range posKeys { + if err := batch.Delete(k, nil); err != nil { + return err + } + ops++ + if err := flush(false); err != nil { + return err + } + } + for _, k := range indexDelKeys { + if err := batch.Delete(k, nil); err != nil { + return err + } + ops++ + if err := flush(false); err != nil { + return err + } + } + if err := flush(true); err != nil { + return err + } + if commits > 0 { + bulkDelLogf("BulkDel chunking complete graph=%s commits=%d", ggraph.graphID, commits) + } + return nil + } + + // Perform deletes + setStage("commit") + bulkDelLogf("BulkDel acquiring pebble write lock graph=%s", ggraph.graphID) + lockWaitStart := time.Now() + ggraph.driver.PebbleLock.Lock() + bulkDelLogf("BulkDel pebble write lock acquired graph=%s wait=%s", ggraph.graphID, time.Since(lockWaitStart)) + commitStart := time.Now() + if err := chunked(singles, ranges, posKeys, indexDelKeys); err != nil { + addErr(err) + } + ggraph.ts.Touch(ggraph.graphID) + ggraph.driver.PebbleLock.Unlock() + log.Infof("BulkDel commit complete graph=%s duration=%s", ggraph.graphID, time.Since(commitStart)) + + bulkDelLogf("Total edges seen: %d", getSeenCount()) + outErr := bulkErr.ErrorOrNil() + setStage("done") + log.Infof("BulkDel done graph=%s totalDuration=%s err=%v", ggraph.graphID, time.Since(start), outErr) + return outErr +} + +func envTruthy(name string) bool { + v := strings.TrimSpace(strings.ToLower(os.Getenv(name))) + switch v { + case "1", "true", "yes", "on": + return true + default: + return false + } +} + +// upperBound computes the tight upper bound for range delete +func upperBound(prefix []byte) []byte { + ub := make([]byte, len(prefix)) + copy(ub, prefix) + for i := len(ub) - 1; i >= 0; i-- { + if ub[i] < 0xFF { + ub[i]++ + return ub[:i+1] + } + } + return nil +} + +// fnv32a computes FNV-1a 32-bit hash +func fnv32a(s string) uint32 { + var h uint32 = 2166136261 + for i := range s { + h ^= uint32(s[i]) + h *= 16777619 + } + return h +} diff --git a/grids/graph_get.go b/grids/graph_get.go new file mode 100644 index 00000000..8aba28e9 --- /dev/null +++ b/grids/graph_get.go @@ -0,0 +1,563 @@ +package grids + +import ( + "bytes" + "context" + "errors" + "fmt" + "time" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/pebblebulk" + "github.com/bmeg/grip/gdbi" + "github.com/bmeg/grip/grids/key" + "github.com/bmeg/grip/log" + "github.com/cockroachdb/pebble" +) + +type idEntry struct { + lookup gdbi.ElementLookup + loc *benchtop.RowLoc + label string + fields []string + data map[string]any + idx int +} + +type lookupPriv struct { + loc *benchtop.RowLoc + fields []string + data map[string]any + uid uint64 +} + +const resolveBatchSize = 20000 + +func (ggraph *Graph) resolveBatch(ctx context.Context, batch []gdbi.ElementLookup, out chan gdbi.ElementLookup, isEdge bool) { + if len(batch) == 0 { + return + } + start := time.Now() + + var withLoc []idEntry + var missingIdx []int + var keys []string + var uidMissingIdx []int + var uidMissingVals []uint64 + + for i, id := range batch { + var entry *benchtop.RowLoc + var fields []string + var label string + if id.Priv != nil { + if loc, ok := id.Priv.(*benchtop.RowLoc); ok { + entry = loc + } else if loc, ok := id.Priv.(benchtop.RowLoc); ok { + entry = &loc + } else if priv, ok := id.Priv.(*lookupPriv); ok && priv != nil { + entry = priv.loc + fields = priv.fields + if priv.data != nil { + withLoc = append(withLoc, idEntry{lookup: id, loc: entry, label: label, fields: fields, data: priv.data, idx: i}) + continue + } + if entry == nil && priv.uid != 0 && !isEdge { + uidMissingIdx = append(uidMissingIdx, i) + uidMissingVals = append(uidMissingVals, priv.uid) + continue + } + } else if priv, ok := id.Priv.(lookupPriv); ok { + entry = priv.loc + fields = priv.fields + if priv.data != nil { + withLoc = append(withLoc, idEntry{lookup: id, loc: entry, label: label, fields: fields, data: priv.data, idx: i}) + continue + } + if entry == nil && priv.uid != 0 && !isEdge { + uidMissingIdx = append(uidMissingIdx, i) + uidMissingVals = append(uidMissingVals, priv.uid) + continue + } + } + } + if id.Vertex != nil { + label = id.Vertex.Get().Label + } else if id.Edge != nil { + label = id.Edge.Get().Label + } + if entry != nil { + if id.Edge != nil && id.Edge.Get() != nil && id.Edge.Get().Data != nil { + withLoc = append(withLoc, idEntry{lookup: id, loc: entry, label: label, fields: fields, data: id.Edge.Get().Data, idx: i}) + continue + } + if label == "" { + if t, err := ggraph.driver.GetTableByID(entry.TableId); err == nil { + label = t.Label + } + } + withLoc = append(withLoc, idEntry{lookup: id, loc: entry, label: label, fields: fields, idx: i}) + } else { + missingIdx = append(missingIdx, i) + keys = append(keys, id.ID) + } + } + + if len(uidMissingVals) > 0 { + locsByUID, err := ggraph.driver.GetVertexLocByUIDBatch(ctx, uidMissingVals) + if err != nil { + log.Errorf("resolveBatch: GetVertexLocByUIDBatch error: %v", err) + } + for j, idx := range uidMissingIdx { + id := batch[idx] + info := locsByUID[uidMissingVals[j]] + if info != nil { + var fields []string + if id.Priv != nil { + if priv, ok := id.Priv.(*lookupPriv); ok && priv != nil { + fields = priv.fields + } else if priv, ok := id.Priv.(lookupPriv); ok { + fields = priv.fields + } + } + withLoc = append(withLoc, idEntry{lookup: id, loc: info.Loc, label: info.Label, fields: fields, data: info.Data, idx: idx}) + } else { + // Fallback to string-ID lookup + missingIdx = append(missingIdx, idx) + keys = append(keys, id.ID) + } + } + } + + if len(keys) > 0 { + locs, err := ggraph.driver.GetLocBatch(ctx, keys) + if !isEdge { + locs, err = ggraph.driver.GetVertexLocBatch(ctx, keys) + } + if err != nil { + log.Errorf("resolveBatch: GetLocBatch error: %v", err) + } + for _, idx := range missingIdx { + id := batch[idx] + info := locs[id.ID] + if info != nil { + var fields []string + if id.Priv != nil { + if priv, ok := id.Priv.(*lookupPriv); ok && priv != nil { + fields = priv.fields + } else if priv, ok := id.Priv.(lookupPriv); ok { + fields = priv.fields + } + } + withLoc = append(withLoc, idEntry{lookup: id, loc: info.Loc, label: info.Label, fields: fields, data: info.Data, idx: idx}) + } + } + } + + if len(withLoc) > 0 { + if isEdge { + ggraph.processEdgeBatch(withLoc, out) + } else { + ggraph.processVertexBatch(withLoc, out) + } + } + if len(batch) >= 1000 { + unresolved := len(batch) - len(withLoc) + log.Debugf("resolveBatch done isEdge=%v input=%d withLoc=%d unresolved=%d elapsed=%s", isEdge, len(batch), len(withLoc), unresolved, time.Since(start).Round(time.Millisecond)) + } +} + +func projectRowMap(row map[string]any, fields []string) map[string]any { + if row == nil { + return nil + } + if len(fields) == 0 { + delete(row, "_id") + delete(row, "_label") + delete(row, "_from") + delete(row, "_to") + return row + } + out := make(map[string]any, len(fields)) + for _, f := range fields { + if f == "_id" || f == "_label" || f == "_from" || f == "_to" { + continue + } + if v, ok := row[f]; ok { + out[f] = v + } + } + return out +} + +func (ggraph *Graph) GetVertexChannel(ctx context.Context, ids chan gdbi.ElementLookup, load bool) chan gdbi.ElementLookup { + out := make(chan gdbi.ElementLookup, 100) + go func() { + defer close(out) + if !load { + for id := range ids { + if ctx.Err() != nil { + return + } + if id.IsSignal() { + select { + case <-ctx.Done(): + return + case out <- id: + } + continue + } + id.Vertex = &gdbi.Vertex{ID: id.ID, Label: labelFromElementID(id.ID)} + select { + case <-ctx.Done(): + return + case out <- id: + } + } + return + } + var batch []gdbi.ElementLookup + for id := range ids { + if ctx.Err() != nil { + return + } + if id.IsSignal() { + if len(batch) > 0 { + ggraph.resolveBatch(ctx, batch, out, false) + batch = nil + } + select { + case <-ctx.Done(): + return + case out <- id: + } + continue + } + batch = append(batch, id) + if len(batch) >= resolveBatchSize { + ggraph.resolveBatch(ctx, batch, out, false) + batch = nil + } + } + if len(batch) > 0 { + ggraph.resolveBatch(ctx, batch, out, false) + } + }() + return out +} + +func (ggraph *Graph) processVertexBatch(batch []idEntry, out chan gdbi.ElementLookup) { + byTable := make(map[uint16][]idEntry) + for _, entry := range batch { + byTable[entry.loc.TableId] = append(byTable[entry.loc.TableId], entry) + } + + maxIdx := -1 + for _, entry := range batch { + if entry.idx > maxIdx { + maxIdx = entry.idx + } + } + ordered := make([]*gdbi.ElementLookup, maxIdx+1) + + for tid, entries := range byTable { + locs := make([]*benchtop.RowLoc, len(entries)) + for i, entry := range entries { + locs[i] = entry.loc + } + + table, err := ggraph.driver.GetTableByID(tid) + var results []map[string]any + var errors []error + if err != nil || table == nil { + log.Errorf("processVertexBatch: table ID %d not found", tid) + errors = make([]error, len(entries)) + for i := range errors { + errors[i] = fmt.Errorf("table not found") + } + continue + } else { + results, errors = table.GetRows(locs) + } + + for i, entry := range entries { + id := entry.lookup + if id.Vertex == nil { + id.Vertex = &gdbi.Vertex{ID: id.ID, Label: entry.label} + } + var res map[string]any + if entry.data != nil { + res = entry.data + } else if errors != nil && errors[i] == nil { + res = results[i] + } else { + continue + } + id.Vertex.Get().Data = projectRowMap(res, entry.fields) + id.Vertex.Get().Loaded = true + ordered[entry.idx] = &id + } + } + + for _, v := range ordered { + if v != nil { + out <- *v + } + } +} + +func (ggraph *Graph) processEdgeBatch(batch []idEntry, out chan gdbi.ElementLookup) { + byTable := make(map[uint16][]idEntry) + for _, entry := range batch { + byTable[entry.loc.TableId] = append(byTable[entry.loc.TableId], entry) + } + + maxIdx := -1 + for _, entry := range batch { + if entry.idx > maxIdx { + maxIdx = entry.idx + } + } + ordered := make([]*gdbi.ElementLookup, maxIdx+1) + + for tid, entries := range byTable { + locs := make([]*benchtop.RowLoc, len(entries)) + for i, entry := range entries { + locs[i] = entry.loc + } + table, err := ggraph.driver.GetTableByID(tid) + var results []map[string]any + var errors []error + if err != nil || table == nil { + log.Errorf("processEdgeBatch: table ID %d not found", tid) + errors = make([]error, len(entries)) + for i := range errors { + errors[i] = fmt.Errorf("table not found") + } + continue + } else { + results, errors = table.GetRows(locs) + } + for i, entry := range entries { + id := entry.lookup + if id.Edge == nil { + id.Edge = &gdbi.Edge{ID: id.ID, Label: entry.label} + } + var res map[string]any + if entry.data != nil { + res = entry.data + } else if errors != nil && errors[i] == nil { + res = results[i] + } else { + continue + } + if from, ok := res["_from"].(string); ok { + id.Edge.Get().From = from + } + if to, ok := res["_to"].(string); ok { + id.Edge.Get().To = to + } + if label, ok := res["_label"].(string); ok { + id.Edge.Get().Label = label + } + id.Edge.Get().Data = projectRowMap(res, entry.fields) + if id.Edge.Get().From == "" { + log.Errorf("processEdgeBatch: edge %s missing _from", id.ID) + continue + } + if id.Edge.Get().To == "" { + log.Errorf("processEdgeBatch: edge %s missing _to", id.ID) + continue + } + id.Edge.Get().Loaded = true + ordered[entry.idx] = &id + } + } + + for _, v := range ordered { + if v != nil { + out <- *v + } + } +} + +// GetVertex loads a vertex given an id. It returns a nil if not found +func (ggraph *Graph) GetVertex(id string, loadProp bool) *gdbi.Vertex { + uid, _ := ggraph.driver.GetID(id) + vkey := key.VertexKey(uid) + val, closer, err := ggraph.driver.Pkv.Get(vkey) + if err != nil { + if errors.Is(err, pebble.ErrNotFound) { + return nil + } + log.Errorf("GetVertex Pkv.Get error: %v", err) + return nil + } + defer closer.Close() + + label, loc := benchtop.DecodeVertexValue(val) + v := &gdbi.Vertex{ + ID: id, + Label: label, + } + + if loadProp { + tableStore, terr := ggraph.driver.GetOrLoadTable("v_" + v.Label) + if terr != nil { + log.Errorf("GetVertex: table load error: %v", terr) + return nil + } + v.Data, err = tableStore.GetRow(loc) + if err != nil { + log.Errorf("GetVertex: table.GetRow( error: %v", err) + return nil + } + + v.Data = projectRowMap(v.Data, nil) + + v.Loaded = true + } else { + v.Data = map[string]any{} + } + return v +} + +// GetEdge loads an edge given an id. It returns nil if not found +func (ggraph *Graph) GetEdge(id string, loadProp bool) *gdbi.Edge { + uid, _ := ggraph.driver.GetID(id) + ekeyPrefix := key.EdgeKeyPrefix(uid) + var e *gdbi.Edge + var byteVal []byte + err := ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(ekeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), ekeyPrefix); it.Next() { + euid, suid, duid, label := key.EdgeKeyParse(it.Key()) + eid, _ := ggraph.driver.TranslateID(euid) + src, _ := ggraph.driver.TranslateID(suid) + dst, _ := ggraph.driver.TranslateID(duid) + + byteVal, _ = it.Value() + e = &gdbi.Edge{ + ID: eid, + From: src, + To: dst, + Label: label, + } + if loadProp { + lbl, loc, data := benchtop.DecodeEdgeValue(byteVal) + if data != nil { + e.Data = projectRowMap(data, nil) + e.Loaded = true + return nil + } + + if loc == nil { + log.Errorf("GetEdge: integrated key missing RowLoc for %s", e.ID) + continue + } + + tableStore, terr := ggraph.driver.GetOrLoadTable("e_" + lbl) + if terr != nil { + log.Errorf("GetEdge: table load error: %v", terr) + continue + } + + var gerr error + e.Data, gerr = tableStore.GetRow(loc) + if gerr != nil { + log.Errorf("GetEdge: GetRow error: %v", gerr) + continue + } + if from, ok := e.Data["_from"].(string); ok { + e.From = from + } + if to, ok := e.Data["_to"].(string); ok { + e.To = to + } + if label, ok := e.Data["_label"].(string); ok { + e.Label = label + } + e.Data = projectRowMap(e.Data, nil) + e.Loaded = true + + } else { + e.Data = map[string]any{} + } + } + return nil + }) + if err != nil { + return nil + } + return e +} + +// GetVertexList produces a channel of all edges in the graph +func (ggraph *Graph) GetVertexList(ctx context.Context, loadProp bool) <-chan *gdbi.Vertex { + o := make(chan *gdbi.Vertex, 100) + go func() { + defer close(o) + ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + vPrefix := key.VertexListPrefix() + for it.Seek(vPrefix); it.Valid() && bytes.HasPrefix(it.Key(), vPrefix); it.Next() { + select { + case <-ctx.Done(): + return nil + default: + } + byteVal, err := it.Value() + if err != nil { + log.Errorf("GetVertexList it.Value() error: %s", err) + } + label, loc := benchtop.DecodeVertexValue(byteVal) + vid, _ := ggraph.driver.TranslateID(key.VertexKeyParse(it.Key())) + v := &gdbi.Vertex{ + ID: vid, + Label: label, + } + if loadProp { + if loc == nil { + log.Errorf("GetVertexList: integrated key missing RowLoc for %s", v.ID) + continue + } + + tableStore, terr := ggraph.driver.GetOrLoadTable("v_" + v.Label) + if terr != nil { + log.Errorf("GetVertexList: table load error: %v", terr) + continue + } + v.Data, err = tableStore.GetRow(loc) + if err != nil { + log.Errorf("GetVertexList: table.GetRow error: %s", err) + continue + } + v.Data = projectRowMap(v.Data, nil) + + v.Loaded = true + + } else { + v.Data = map[string]any{} + } + o <- v + } + return nil + }) + }() + return o +} + +// ListVertexLabels returns a list of vertex types in the graph +func (ggraph *Graph) ListVertexLabels() ([]string, error) { + labels := []string{} + for i := range ggraph.driver.GetLabels(false, true) { + labels = append(labels, i) + } + return labels, nil +} + +// ListEdgeLabels returns a list of edge types in the graph +func (ggraph *Graph) ListEdgeLabels() ([]string, error) { + labels := []string{} + for i := range ggraph.driver.GetLabels(true, true) { + labels = append(labels, i) + } + return labels, nil +} diff --git a/grids/graph_traverse.go b/grids/graph_traverse.go new file mode 100644 index 00000000..c3e14dc5 --- /dev/null +++ b/grids/graph_traverse.go @@ -0,0 +1,251 @@ +package grids + +import ( + "bytes" + "context" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/pebblebulk" + "github.com/bmeg/grip/gdbi" + "github.com/bmeg/grip/grids/key" + "github.com/bmeg/grip/util/setcmp" +) + +// GetOutChannel process requests of vertex ids and find the connected vertices on outgoing edges +func (ggraph *Graph) GetOutChannel(ctx context.Context, reqChan chan gdbi.ElementLookup, load bool, emitNull bool, edgeLabels []string) chan gdbi.ElementLookup { + o := make(chan gdbi.ElementLookup, 100) + go func() { + defer close(o) + ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + var batch []gdbi.ElementLookup + for req := range reqChan { + if req.IsSignal() { + if len(batch) > 0 { + ggraph.resolveBatch(ctx, batch, o, false) + batch = nil + } + o <- req + continue + } + found := false + uid, _ := ggraph.driver.GetID(req.ID) + skeyPrefix := key.SrcEdgePrefix(uid) + for it.Seek(skeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), skeyPrefix); it.Next() { + _, _, duid, label := key.SrcEdgeKeyParse(it.Key()) + if len(edgeLabels) == 0 || setcmp.ContainsString(edgeLabels, label) { + dst, _ := ggraph.driver.TranslateID(duid) + if !load { + req.Vertex = &gdbi.Vertex{ID: dst, Label: labelFromElementID(dst)} + o <- req + } else { + batch = append(batch, gdbi.ElementLookup{ID: dst, Ref: req.Ref, Priv: lookupPriv{uid: duid}}) + if len(batch) >= 1000 { + ggraph.resolveBatch(ctx, batch, o, false) + batch = nil + } + } + found = true + } + } + if !found && emitNull { + req.Vertex = nil + o <- req + } + } + if len(batch) > 0 { + ggraph.resolveBatch(ctx, batch, o, false) + } + return nil + }) + }() + return o +} + +// GetInChannel process requests of vertex ids and find the connected vertices on incoming edges +func (ggraph *Graph) GetInChannel(ctx context.Context, reqChan chan gdbi.ElementLookup, load bool, emitNull bool, edgeLabels []string) chan gdbi.ElementLookup { + o := make(chan gdbi.ElementLookup, 100) + go func() { + defer close(o) + ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + var batch []gdbi.ElementLookup + for req := range reqChan { + if req.IsSignal() { + if len(batch) > 0 { + ggraph.resolveBatch(ctx, batch, o, false) + batch = nil + } + o <- req + continue + } + found := false + uid, _ := ggraph.driver.GetID(req.ID) + dkeyPrefix := key.DstEdgePrefix(uid) + for it.Seek(dkeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), dkeyPrefix); it.Next() { + _, suid, _, label := key.DstEdgeKeyParse(it.Key()) + if len(edgeLabels) == 0 || setcmp.ContainsString(edgeLabels, label) { + src, _ := ggraph.driver.TranslateID(suid) + if !load { + req.Vertex = &gdbi.Vertex{ID: src, Label: labelFromElementID(src)} + o <- req + } else { + batch = append(batch, gdbi.ElementLookup{ID: src, Ref: req.Ref, Priv: lookupPriv{uid: suid}}) + if len(batch) >= 1000 { + ggraph.resolveBatch(ctx, batch, o, false) + batch = nil + } + } + found = true + } + } + + if !found && emitNull { + req.Vertex = nil + o <- req + } + } + if len(batch) > 0 { + ggraph.resolveBatch(ctx, batch, o, false) + } + return nil + }) + }() + return o +} + +// GetOutEdgeChannel process requests of vertex ids and find the connected outgoing edges +func (ggraph *Graph) GetOutEdgeChannel(ctx context.Context, reqChan chan gdbi.ElementLookup, load bool, emitNull bool, edgeLabels []string) chan gdbi.ElementLookup { + o := make(chan gdbi.ElementLookup, 100) + go func() { + defer close(o) + ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + var batch []gdbi.ElementLookup + for req := range reqChan { + if req.IsSignal() { + if len(batch) > 0 { + ggraph.resolveBatch(ctx, batch, o, true) + batch = nil + } + o <- req + continue + } + found := false + uid, _ := ggraph.driver.GetID(req.ID) + skeyPrefix := key.SrcEdgePrefix(uid) + for it.Seek(skeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), skeyPrefix); it.Next() { + euid, suid, duid, label := key.SrcEdgeKeyParse(it.Key()) + if len(edgeLabels) == 0 || setcmp.ContainsString(edgeLabels, label) { + eid, _ := ggraph.driver.TranslateID(euid) + src, _ := ggraph.driver.TranslateID(suid) + dst, _ := ggraph.driver.TranslateID(duid) + + byteVal, _ := it.Value() + _, loc, data := benchtop.DecodeEdgeValue(byteVal) + e := gdbi.Edge{ + From: src, + To: dst, + Label: label, + ID: eid, + } + if data != nil { + e.Data = data + e.Loaded = true + } + if !load { + if e.Data == nil { + e.Data = map[string]any{} + } + req.Edge = &e + o <- req + } else { + batch = append(batch, gdbi.ElementLookup{ID: eid, Ref: req.Ref, Edge: &e, Priv: lookupPriv{loc: loc, data: e.Data}}) + if len(batch) >= 1000 { + ggraph.resolveBatch(ctx, batch, o, true) + batch = nil + } + } + found = true + } + } + + if !found && emitNull { + req.Edge = nil + o <- req + } + } + if len(batch) > 0 { + ggraph.resolveBatch(ctx, batch, o, true) + } + return nil + }) + }() + return o +} + +// GetInEdgeChannel process requests of vertex ids and find the connected incoming edges +func (ggraph *Graph) GetInEdgeChannel(ctx context.Context, reqChan chan gdbi.ElementLookup, load bool, emitNull bool, edgeLabels []string) chan gdbi.ElementLookup { + o := make(chan gdbi.ElementLookup, 100) + go func() { + defer close(o) + ggraph.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + var batch []gdbi.ElementLookup + for req := range reqChan { + if req.IsSignal() { + if len(batch) > 0 { + ggraph.resolveBatch(ctx, batch, o, true) + batch = nil + } + o <- req + continue + } + found := false + uid, _ := ggraph.driver.GetID(req.ID) + dkeyPrefix := key.DstEdgePrefix(uid) + for it.Seek(dkeyPrefix); it.Valid() && bytes.HasPrefix(it.Key(), dkeyPrefix); it.Next() { + euid, suid, duid, label := key.DstEdgeKeyParse(it.Key()) + if len(edgeLabels) == 0 || setcmp.ContainsString(edgeLabels, label) { + eid, _ := ggraph.driver.TranslateID(euid) + src, _ := ggraph.driver.TranslateID(suid) + dst, _ := ggraph.driver.TranslateID(duid) + + byteVal, _ := it.Value() + _, loc, data := benchtop.DecodeEdgeValue(byteVal) + e := gdbi.Edge{ + From: src, + To: dst, + Label: label, + ID: eid, + } + if data != nil { + e.Data = data + e.Loaded = true + } + if !load { + if e.Data == nil { + e.Data = map[string]any{} + } + req.Edge = &e + o <- req + } else { + batch = append(batch, gdbi.ElementLookup{ID: eid, Ref: req.Ref, Edge: &e, Priv: lookupPriv{loc: loc, data: e.Data}}) + if len(batch) >= 1000 { + ggraph.resolveBatch(ctx, batch, o, true) + batch = nil + } + } + found = true + } + } + + if !found && emitNull { + req.Edge = nil + o <- req + } + } + if len(batch) > 0 { + ggraph.resolveBatch(ctx, batch, o, true) + } + return nil + }) + }() + return o +} diff --git a/grids/graphdb.go b/grids/graphdb.go index 14b7a637..8763670e 100644 --- a/grids/graphdb.go +++ b/grids/graphdb.go @@ -13,18 +13,20 @@ import ( // GridsGDB implements the GripInterface using a generic key/value storage driver type GDB struct { - basePath string - drivers map[string]*Graph + conf Config + drivers map[string]*Graph + mu sync.Mutex } // NewKVGraphDB intitalize a new grids graph driver -func NewGraphDB(baseDir string) (gdbi.GraphDB, error) { +func NewGraphDB(conf Config) (gdbi.GraphDB, error) { + conf.SetDefaults() log.Redf("Disclaimer: the Grids driver is an experimental database driver. Use with caution.") - _, err := os.Stat(baseDir) + _, err := os.Stat(conf.GraphDir) if os.IsNotExist(err) { - os.Mkdir(baseDir, 0700) + os.Mkdir(conf.GraphDir, 0700) } - return &GDB{basePath: baseDir, drivers: map[string]*Graph{}}, nil + return &GDB{conf: conf, drivers: map[string]*Graph{}}, nil } // Graph obtains the gdbi.DBI for a particular graph @@ -33,24 +35,23 @@ func (kgraph *GDB) Graph(graph string) (gdbi.GraphInterface, error) { if err != nil { return nil, err } - mu := sync.Mutex{} - mu.Lock() + kgraph.mu.Lock() g, ok := kgraph.drivers[graph] - mu.Unlock() + kgraph.mu.Unlock() if ok { return g, nil } - dbPath := filepath.Join(kgraph.basePath, graph) + dbPath := filepath.Join(kgraph.conf.GraphDir, graph) if _, err := os.Stat(dbPath); err == nil { // This also fetches an existing graph if it doesn't exist in kgraph.drivers - g, err := getGraph(kgraph.basePath, graph) + g, err := getGraph(kgraph.conf, graph) if err != nil { return nil, err } - mu.Lock() + kgraph.mu.Lock() kgraph.drivers[graph] = g - mu.Unlock() + kgraph.mu.Unlock() return g, nil } @@ -60,7 +61,7 @@ func (kgraph *GDB) Graph(graph string) (gdbi.GraphInterface, error) { // ListGraphs lists the graphs managed by this driver func (gdb *GDB) ListGraphs() []string { out := []string{} - if ds, err := filepath.Glob(filepath.Join(gdb.basePath, "*")); err == nil { + if ds, err := filepath.Glob(filepath.Join(gdb.conf.GraphDir, "*")); err == nil { for _, d := range ds { fi, err := os.Stat(d) if err != nil { diff --git a/grids/index.go b/grids/index.go index 5cce426c..5174810f 100644 --- a/grids/index.go +++ b/grids/index.go @@ -5,6 +5,7 @@ import ( "fmt" "github.com/bmeg/benchtop" + "github.com/bmeg/grip/grids/key" "github.com/bmeg/grip/gripql" "github.com/bmeg/grip/log" "github.com/cockroachdb/pebble" @@ -14,13 +15,43 @@ import ( // AddVertexIndex add index to vertices func (ggraph *Graph) AddVertexIndex(label, field string) error { log.WithFields(log.Fields{"label": label, "field": field}).Info("Adding vertex index") - return ggraph.jsonkv.AddField(VTABLE_PREFIX+label, field) + tableLabel := key.VertexTablePrefix + label + id, err := ggraph.driver.TableDr.LookupTableID(tableLabel) + if err != nil { + // Attempt to create the table if it doesn't exist + if _, err := ggraph.driver.New(tableLabel, nil); err != nil { + return fmt.Errorf("AddVertexIndex: failed to create table %s: %v", tableLabel, err) + } + // Lookup again + id, err = ggraph.driver.TableDr.LookupTableID(tableLabel) + if err != nil { + return fmt.Errorf("AddVertexIndex: table lookup failed after creation %s: %v", tableLabel, err) + } + } + if table, err := ggraph.driver.GetOrLoadTable(tableLabel); err == nil && table != nil { + if _, ok := table.Fields[field]; ok { + log.WithFields(log.Fields{"label": label, "field": field}).Debug("Vertex index already present; skipping") + return nil + } + } + return ggraph.driver.AddField(id, field) } // DeleteVertexIndex delete index from vertices func (ggraph *Graph) DeleteVertexIndex(label, field string) error { log.WithFields(log.Fields{"label": label, "field": field}).Info("Deleting vertex index") - return ggraph.jsonkv.RemoveField(VTABLE_PREFIX+label, field) + tableLabel := key.VertexTablePrefix + label + if table, err := ggraph.driver.GetOrLoadTable(tableLabel); err == nil && table != nil { + if _, ok := table.Fields[field]; !ok { + log.WithFields(log.Fields{"label": label, "field": field}).Debug("Vertex index missing; skipping delete") + return nil + } + } + id, err := ggraph.driver.TableDr.LookupTableID(tableLabel) + if err != nil { + return err + } + return ggraph.driver.RemoveField(id, field) } // GetVertexIndexList lists out all the vertex indices for a graph @@ -29,8 +60,12 @@ func (ggraph *Graph) GetVertexIndexList() <-chan *gripql.IndexID { out := make(chan *gripql.IndexID) go func() { defer close(out) - for _, f := range ggraph.jsonkv.ListFields() { - out <- &gripql.IndexID{Graph: ggraph.graphID, Label: f.Label, Field: f.Field} + for _, f := range ggraph.driver.ListFields() { + label := f.Label + if len(label) > 2 && label[:2] == key.VertexTablePrefix { + label = label[2:] + } + out <- &gripql.IndexID{Graph: ggraph.graphID, Label: label, Field: f.Field} } }() return out @@ -39,57 +74,73 @@ func (ggraph *Graph) GetVertexIndexList() <-chan *gripql.IndexID { // VertexLabelScan produces a channel of all vertex ids in a graph // that match a given label func (ggraph *Graph) VertexLabelScan(ctx context.Context, label string) chan string { - if label[:2] != VTABLE_PREFIX { - label = VTABLE_PREFIX + label + if len(label) < 2 || label[:2] != key.VertexTablePrefix { + label = key.VertexTablePrefix + label } log.WithFields(log.Fields{"label": label}).Info("Running VertexLabelScan") - return ggraph.jsonkv.GetIDsForLabel(label) + return ggraph.driver.GetIDsForLabel(label) } -func (ggraph *Graph) DeleteAnyRow(id string, label string, edgeFlag bool) error { +func (ggraph *Graph) DeleteAnyRow(id string, label string, edgeFlag bool, loc *benchtop.RowLoc) error { var prefix string = "v_" if edgeFlag { prefix = "e_" } - loc, err := ggraph.jsonkv.LocCache.Get(context.Background(), id) - if err != nil { - return err + if loc == nil { + return fmt.Errorf("DeleteAnyRow: RowLoc is nil for %s", id) } tableLabel := prefix + label var bulkErr *multierror.Error - if table, exists := ggraph.jsonkv.Tables[tableLabel]; exists { + table, err := ggraph.driver.GetOrLoadTable(tableLabel) + hasTable := (err == nil && table != nil) + if hasTable { + // Verify lineage + if table.TableId != loc.TableId { + log.Warningf("table mismatch during delete of %s: index says %s (ID %d) but row loc says TableID %d; using loc TableID", id, tableLabel, table.TableId, loc.TableId) + // Use GetTableInfo instead of LabelLookup + if info, err := ggraph.driver.TableDr.GetTableInfo(loc.TableId); err == nil { + name := info.Name + // Ensure it is of the right type (v_ or e_) + if len(name) > 2 && name[:2] == prefix { + if realTable, err := ggraph.driver.GetOrLoadTable(name); err == nil { + table = realTable + } + } + } + } for field := range table.Fields { - if err := ggraph.jsonkv.DeleteRowField(tableLabel, field, id); err != nil { - log.Errorf("Failed to delete index for field '%s' in table '%s' for row '%s': %v", field, tableLabel, id, err) + if err := ggraph.driver.DeleteRowField(loc.TableId, field, id); err != nil { + log.Errorf("Failed to delete index for field '%s' in table ID %d for row '%s': %v", field, loc.TableId, id, err) bulkErr = multierror.Append(bulkErr, err) } } } - ggraph.jsonkv.PebbleLock.Lock() - defer ggraph.jsonkv.PebbleLock.Unlock() - - table, ok := ggraph.jsonkv.Tables[prefix+label] - if !ok { - bulkErr = multierror.Append(bulkErr, fmt.Errorf("table %s not found in jsonkv.Tables: %#v", prefix+label, ggraph.jsonkv.Tables)) - return bulkErr.ErrorOrNil() - } + ggraph.driver.PebbleLock.Lock() + defer ggraph.driver.PebbleLock.Unlock() bId := []byte(id) - err = ggraph.jsonkv.Pkv.Delete(benchtop.NewPosKey(table.TableId, bId), nil) + err = ggraph.driver.Pkv.Delete(benchtop.NewPosKey(loc.TableId, bId), nil) if err != nil { - return err + bulkErr = multierror.Append(bulkErr, err) } - err = table.DeleteRow(loc, bId) - if err != nil { - if err == pebble.ErrNotFound { - log.Debugf("Pebble not Found: % s", err) - return nil + + if hasTable { + err = table.DeleteRow(loc, bId) + if err != nil { + if err == pebble.ErrNotFound { + log.Debugf("Pebble not Found: %s", err) + } else { + bulkErr = multierror.Append(bulkErr, err) + } } - bulkErr = multierror.Append(bulkErr, err) + } else { + log.Warningf("table %s not found in driver.Tables during delete of row %s; skipping data storage deletion but continuing with index cleanup", tableLabel, id) } - ggraph.jsonkv.LocCache.Invalidate(id) + + // ggraph.driver.LocCache.Invalidate(id) // Gone + ggraph.driver.TableDr.InvalidateLoc(loc.TableId, id) return bulkErr.ErrorOrNil() } diff --git a/grids/integrated_key_test.go b/grids/integrated_key_test.go new file mode 100644 index 00000000..349c1a1f --- /dev/null +++ b/grids/integrated_key_test.go @@ -0,0 +1,109 @@ +package grids + +import ( + "fmt" + "testing" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/pebblebulk" + "github.com/bmeg/grip/gdbi" + "github.com/bmeg/grip/grids/key" +) + +func TestIntegratedKeyRowLoc(t *testing.T) { + conf := Config{ + GraphDir: t.TempDir(), + Driver: "jsontable", + } + defer fmt.Println("Done") + + dbi, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("failed to create dbi: %v", err) + } + defer dbi.Close() + + if err := dbi.AddGraph("test"); err != nil { + t.Fatalf("failed to add graph: %v", err) + } + + gi, _ := dbi.Graph("test") + g := gi.(*Graph) + + // Add a vertex + vID := "v1" + vLabel := "Person" + err = g.AddVertex([]*gdbi.Vertex{ + { + ID: vID, + Label: vLabel, + Data: map[string]any{ + "name": "Alice", + }, + }, + }) + if err != nil { + t.Fatalf("AddVertex failed: %v", err) + } + + // Verify the key in Pebble directly + uvid, _ := g.driver.GetID(vID) + vkey := key.VertexKey(uvid) + err = g.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + val, err := it.Get(vkey) + if err != nil { + return err + } + label, loc := benchtop.DecodeVertexValue(val) + if label != vLabel { + t.Errorf("expected label %s, got %s", vLabel, label) + } + if loc == nil { + t.Errorf("RowLoc should not be nil in integrated vertex key") + } else { + t.Logf("Found RowLoc in key: %+v", loc) + } + return nil + }) + if err != nil { + t.Fatalf("Pebble View error: %v", err) + } + + // Add an edge + eID := "e1" + eLabel := "knows" + err = g.AddEdge([]*gdbi.Edge{ + { + ID: eID, + From: vID, + To: vID, + Label: eLabel, + Data: map[string]any{ + "since": 2020, + }, + }, + }) + if err != nil { + t.Fatalf("AddEdge failed: %v", err) + } + + // Verify edge key + ueid, _ := g.driver.GetID(eID) + ekey := key.EdgeKey(ueid, uvid, uvid, eLabel) + err = g.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + val, err := it.Get(ekey) + if err != nil { + return err + } + _, loc, _ := benchtop.DecodeEdgeValue(val) + if loc == nil { + t.Errorf("RowLoc should not be nil in integrated edge value") + } else { + t.Logf("Found RowLoc in edge value: %+v", loc) + } + return nil + }) + if err != nil { + t.Fatalf("Pebble View error: %v", err) + } +} diff --git a/grids/key/key.go b/grids/key/key.go new file mode 100644 index 00000000..43681e70 --- /dev/null +++ b/grids/key/key.go @@ -0,0 +1,204 @@ +package key + +import ( + "bytes" + "encoding/binary" + + "github.com/bmeg/benchtop" +) + +const ( + VertexTablePrefix = "v_" + EdgeTablePrefix = "e_" + EdgeKeySize = 1 + 8 + 8 + 8 +) + +// maxEdgeLabelLen limits the size of edge labels encoded into keys. +// This prevents integer overflow and excessively large allocations +// when computing key sizes that include len(label). +const maxEdgeLabelLen = 1 << 20 // 1 MiB + +var vertexPrefix = []byte(".") +var edgePrefix = []byte("-") +var srcEdgePrefix = []byte("<") +var dstEdgePrefix = []byte(">") + +// ID mapping prefixes (local to grids or using benchtop ones) +func StringToIDKey(s string) []byte { + return append([]byte{benchtop.IDMappingPrefix}, []byte(s)...) +} + +func IDToStringKey(id uint64) []byte { + out := make([]byte, 9) + out[0] = benchtop.RIDMappingPrefix + binary.BigEndian.PutUint64(out[1:], id) + return out +} +func ParseRIDKey(k []byte) uint64 { + return binary.BigEndian.Uint64(k[1:]) +} + +// VertexKey generates the key given a vertex uint64 ID +func VertexKey(id uint64) []byte { + out := make([]byte, 9) + out[0] = vertexPrefix[0] + binary.BigEndian.PutUint64(out[1:], id) + return out +} + +func VertexKeyParse(key []byte) uint64 { + return binary.BigEndian.Uint64(key[1:]) +} + +// VertexIntegratedParse parses both vertex ID from key and label/loc from value +func VertexIntegratedParse(k, v []byte) (id uint64, label string, locBytes []byte) { + id = VertexKeyParse(k) + idx := bytes.IndexByte(v, 0) + if idx < 0 { + return id, string(v), nil + } + label = string(v[:idx]) + locBytes = v[idx+1:] + return id, label, locBytes +} + +// IntegratedEdgeValueParse extracts RowLoc bytes from an edge index value +func IntegratedEdgeValueParse(v []byte) []byte { + if len(v) >= 12 { + return v + } + return nil +} + +// EdgeKeyPrefix returns the byte array prefix for a particular edge id (uint64) +func EdgeKeyPrefix(id uint64) []byte { + out := make([]byte, 9) + out[0] = edgePrefix[0] + binary.BigEndian.PutUint64(out[1:], id) + return out +} + +// SrcEdgePrefix returns a byte array prefix for all entries in the source +// edge index a particular vertex (the source vertex) +func SrcEdgePrefix(id uint64) []byte { + out := make([]byte, 9) + out[0] = srcEdgePrefix[0] + binary.BigEndian.PutUint64(out[1:], id) + return out +} + +// DstEdgePrefix returns a byte array prefix for all entries in the dest +// edge index a particular vertex (the dest vertex) +func DstEdgePrefix(id uint64) []byte { + out := make([]byte, 9) + out[0] = dstEdgePrefix[0] + binary.BigEndian.PutUint64(out[1:], id) + return out +} + +// EdgeKey takes the required components of an edge key and returns the byte array +func EdgeKey(id, src, dst uint64, label string) []byte { + // Format: E | id(8) | src(8) | dst(8) | label(var) + if len(label) > maxEdgeLabelLen { + // Truncate excessively long labels to avoid overflow and huge allocations. + label = label[:maxEdgeLabelLen] + } + totalSize := int64(EdgeKeySize) + int64(len(label)) + out := make([]byte, totalSize) + out[0] = edgePrefix[0] + binary.BigEndian.PutUint64(out[1:], id) + binary.BigEndian.PutUint64(out[9:], src) + binary.BigEndian.PutUint64(out[17:], dst) + copy(out[25:], label) + return out +} + +func EdgeKeyParse(key []byte) (eid uint64, sid uint64, did uint64, label string) { + eid = binary.BigEndian.Uint64(key[1:9]) + sid = binary.BigEndian.Uint64(key[9:17]) + did = binary.BigEndian.Uint64(key[17:25]) + label = string(key[25:]) + return +} + +// SrcEdgeKey creates a src edge index key +func SrcEdgeKey(eid, src, dst uint64, label string) []byte { + // Format: < | src(8) | dst(8) | id(8) | label(var) + if len(label) > maxEdgeLabelLen { + label = label[:maxEdgeLabelLen] + } + totalSize := int64(EdgeKeySize) + int64(len(label)) + out := make([]byte, totalSize) + out[0] = srcEdgePrefix[0] + binary.BigEndian.PutUint64(out[1:], src) + binary.BigEndian.PutUint64(out[9:], dst) + binary.BigEndian.PutUint64(out[17:], eid) + copy(out[25:], label) + return out +} + +func SrcEdgeKeyParse(key []byte) (eid uint64, sid uint64, did uint64, label string) { + sid = binary.BigEndian.Uint64(key[1:9]) + did = binary.BigEndian.Uint64(key[9:17]) + eid = binary.BigEndian.Uint64(key[17:25]) + label = string(key[25:]) + return +} + +// DstEdgeKey creates a dest edge index key +func DstEdgeKey(eid, src, dst uint64, label string) []byte { + // Format: > | dst(8) | src(8) | id(8) | label(var) + if len(label) > maxEdgeLabelLen { + label = label[:maxEdgeLabelLen] + } + totalSize := int64(EdgeKeySize) + int64(len(label)) + out := make([]byte, totalSize) + out[0] = dstEdgePrefix[0] + binary.BigEndian.PutUint64(out[1:], dst) + binary.BigEndian.PutUint64(out[9:], src) + binary.BigEndian.PutUint64(out[17:], eid) + copy(out[25:], label) + return out +} + +func DstEdgeKeyParse(key []byte) (eid uint64, sid uint64, did uint64, label string) { + did = binary.BigEndian.Uint64(key[1:9]) + sid = binary.BigEndian.Uint64(key[9:17]) + eid = binary.BigEndian.Uint64(key[17:25]) + label = string(key[25:]) + return +} + +// VertexListPrefix returns a byte array prefix for all vertices in a graph +func VertexListPrefix() []byte { + return bytes.Join([][]byte{ + vertexPrefix, + {}, + }, []byte{0}) +} + +// EdgeListPrefix returns a byte array prefix for all edges in a graph +func EdgeListPrefix() []byte { + return bytes.Join([][]byte{ + edgePrefix, + {}, + }, []byte{0}) +} + +// SrcEdgeListPrefix returns a byte array prefix for all entries in the source +// edge index for a graph +func SrcEdgeListPrefix() []byte { + return bytes.Join([][]byte{ + srcEdgePrefix, + {}, + }, []byte{0}) +} + +// DstEdgeListPrefix returns a byte array prefix for all entries in the dest +// edge index for a graph +func DstEdgeListPrefix() []byte { + return bytes.Join([][]byte{ + dstEdgePrefix, + {}, + }, []byte{0}) +} diff --git a/grids/keyindex.go b/grids/keyindex.go deleted file mode 100644 index 2eae94c0..00000000 --- a/grids/keyindex.go +++ /dev/null @@ -1,137 +0,0 @@ -package grids - -import ( - "bytes" -) - -var vertexPrefix = []byte(".") -var edgePrefix = []byte("-") -var srcEdgePrefix = []byte("<") -var dstEdgePrefix = []byte(">") - -var intSize = 10 - -// VertexKey generates the key given a vertexId -func VertexKey(id string) []byte { - return bytes.Join([][]byte{ - vertexPrefix, - []byte(id), - }, []byte{0}) -} - -func VertexKeyParse(key []byte) (id string) { - tmp := bytes.Split(key, []byte{0}) - return string(tmp[1]) -} - -// EdgeKeyPrefix returns the byte array prefix for a particular edge id -func EdgeKeyPrefix(id string) []byte { - return bytes.Join([][]byte{ - edgePrefix, - []byte(id), - {}, - }, []byte{0}) -} - -// SrcEdgePrefix returns a byte array prefix for all entries in the source -// edge index a particular vertex (the source vertex) -func SrcEdgePrefix(id string) []byte { - return bytes.Join([][]byte{ - srcEdgePrefix, - []byte(id), - {}, - }, []byte{0}) -} - -// DstEdgePrefix returns a byte array prefix for all entries in the dest -// edge index a particular vertex (the dest vertex) -func DstEdgePrefix(id string) []byte { - return bytes.Join([][]byte{ - dstEdgePrefix, - []byte(id), - {}, - }, []byte{0}) -} - -// EdgeKey takes the required components of an edge key and returns the byte array -func EdgeKey(id, src, dst, label string) []byte { - return bytes.Join([][]byte{ - edgePrefix, - []byte(id), - []byte(label), - []byte(src), - []byte(dst), - }, []byte{0}) -} - -func EdgeKeyParse(key []byte) (eid string, sid string, did string, label string) { - tmp := bytes.Split(key, []byte{0}) - return string(tmp[1]), string(tmp[3]), string(tmp[4]), string(tmp[2]) -} - -// SrcEdgeKey creates a src edge index key -func SrcEdgeKey(eid, src, dst, label string) []byte { - return bytes.Join([][]byte{ - srcEdgePrefix, - []byte(src), - []byte(dst), - []byte(eid), - []byte(label), - }, []byte{0}) -} - -func SrcEdgeKeyParse(key []byte) (eid string, sid string, did string, label string) { - tmp := bytes.Split(key, []byte{0}) - return string(tmp[3]), string(tmp[1]), string(tmp[2]), string(tmp[4]) -} - -// DstEdgeKey creates a dest edge index key -func DstEdgeKey(eid, src, dst, label string) []byte { - return bytes.Join([][]byte{ - dstEdgePrefix, - []byte(dst), - []byte(src), - []byte(eid), - []byte(label), - }, []byte{0}) -} - -func DstEdgeKeyParse(key []byte) (eid string, sid string, did string, label string) { - tmp := bytes.Split(key, []byte{0}) - return string(tmp[3]), string(tmp[2]), string(tmp[1]), string(tmp[4]) -} - - -// VertexListPrefix returns a byte array prefix for all vertices in a graph -func VertexListPrefix() []byte { - return bytes.Join([][]byte{ - vertexPrefix, - {}, - }, []byte{0}) -} - -// EdgeListPrefix returns a byte array prefix for all edges in a graph -func EdgeListPrefix() []byte { - return bytes.Join([][]byte{ - edgePrefix, - {}, - }, []byte{0}) -} - -// SrcEdgeListPrefix returns a byte array prefix for all entries in the source -// edge index for a graph -func SrcEdgeListPrefix() []byte { - return bytes.Join([][]byte{ - srcEdgePrefix, - {}, - }, []byte{0}) -} - -// DstEdgeListPrefix returns a byte array prefix for all entries in the dest -// edge index for a graph -func DstEdgeListPrefix() []byte { - return bytes.Join([][]byte{ - dstEdgePrefix, - {}, - }, []byte{0}) -} diff --git a/grids/lazy_ref.go b/grids/lazy_ref.go new file mode 100644 index 00000000..48721b1c --- /dev/null +++ b/grids/lazy_ref.go @@ -0,0 +1,38 @@ +package grids + +import ( + "sync" + + "github.com/bmeg/grip/gdbi" +) + +// lazyElementRef defers row hydration until Get() is called. +type lazyElementRef struct { + meta gdbi.DataElement + loadFn func() *gdbi.DataElement + + once sync.Once + elem *gdbi.DataElement +} + +func (l *lazyElementRef) Identity() *gdbi.DataElement { + m := l.meta + return &m +} + +func (l *lazyElementRef) Get() *gdbi.DataElement { + l.once.Do(func() { + if l.loadFn != nil { + l.elem = l.loadFn() + } + }) + if l.elem != nil { + return l.elem + } + m := l.meta + return &m +} + +func (l *lazyElementRef) Copy() gdbi.DataRef { + return l +} diff --git a/grids/new.go b/grids/new.go index 7a5b6001..c82d480a 100644 --- a/grids/new.go +++ b/grids/new.go @@ -5,10 +5,12 @@ import ( "fmt" "os" "path/filepath" + "strconv" "strings" "sync" + "time" - "github.com/bmeg/benchtop/jsontable" + "github.com/bmeg/grip/grids/driver" "github.com/bmeg/grip/gripql" "github.com/bmeg/grip/timestamp" ) @@ -17,15 +19,16 @@ import ( type Graph struct { graphID string - jsonkv *jsontable.JSONDriver - ts *timestamp.Timestamp - tempDeletedEdges map[string]struct{} - edgesMutex sync.Mutex + driver *driver.GridKVDriver + ts *timestamp.Timestamp + tempDeletedEdges map[string]struct{} + edgesMutex sync.Mutex + BulkLoaderWorkers int } // Close the connection func (g *Graph) Close() error { - g.jsonkv.Close() + g.driver.Close() return nil } @@ -35,53 +38,51 @@ func (kgraph *GDB) AddGraph(graph string) error { if err != nil { return err } - g, err := newGraph(kgraph.basePath, graph) + g, err := newGraph(kgraph.conf, graph) if err != nil { return err } + kgraph.mu.Lock() + defer kgraph.mu.Unlock() kgraph.drivers[graph] = g return nil } -func newGraph(baseDir, name string) (*Graph, error) { - dbPath := filepath.Join(baseDir, name) +func newGraph(conf Config, name string) (*Graph, error) { + dbPath := filepath.Join(conf.GraphDir, name) fmt.Printf("Creating new GRIDS graph %s\n", name) - // Create directory if it doesn't exist if _, err := os.Stat(dbPath); os.IsNotExist(err) { if err := os.Mkdir(dbPath, 0700); err != nil { return nil, fmt.Errorf("failed to create directory %s: %v", dbPath, err) } } - // Create VERSION file versionPath := filepath.Join(dbPath, "VERSION") if err := os.WriteFile(versionPath, []byte("0.0.1"), 0644); err != nil { return nil, fmt.Errorf("failed to create VERSION file: %v", err) } - //bsonkvPath := fmt.Sprintf("%s", dbPath) - jsonkvPath := dbPath - tabledr, err := jsontable.NewJSONDriver(jsonkvPath) + drvr, err := openGridKVDriverWithRetry(conf, dbPath) if err != nil { - return nil, fmt.Errorf("failed to open jsonkv at %s: %v", jsonkvPath, err) + return nil, fmt.Errorf("failed to open grids storage at %s: %v", dbPath, err) } - jsonkv := tabledr.(*jsontable.JSONDriver) ts := timestamp.NewTimestamp() o := &Graph{ - jsonkv: jsonkv, - ts: &ts, - graphID: name, - tempDeletedEdges: make(map[string]struct{}), - edgesMutex: sync.Mutex{}, + driver: drvr, + ts: &ts, + graphID: name, + tempDeletedEdges: make(map[string]struct{}), + edgesMutex: sync.Mutex{}, + BulkLoaderWorkers: conf.BulkLoaderWorkers, } return o, nil } -func getGraph(baseDir, name string) (*Graph, error) { - dbPath := filepath.Join(baseDir, name) +func getGraph(conf Config, name string) (*Graph, error) { + dbPath := filepath.Join(conf.GraphDir, name) fmt.Printf("fetching GRIDS graph %s\n", name) versionPath := filepath.Join(dbPath, "VERSION") @@ -95,29 +96,24 @@ func getGraph(baseDir, name string) (*Graph, error) { if scanner.Scan() { version := scanner.Text() if strings.TrimSpace(version) != "0.0.1" { - return nil, fmt.Errorf("VERSION file at %s does not have '0.0.1' on the first line", versionPath) + return nil, fmt.Errorf("unsupported version %s", version) } - } else { - return nil, fmt.Errorf("VERSION file at %s is empty", versionPath) } - if err := scanner.Err(); err != nil { - return nil, fmt.Errorf("error reading VERSION file at %s: %v", versionPath, err) - } - - jsonkvPath := dbPath - tabledr, err := jsontable.LoadJSONDriver(jsonkvPath) + drvr, err := openGridKVDriverWithRetry(conf, dbPath) if err != nil { - return nil, fmt.Errorf("failed to open bsonkv at %s: %v", jsonkvPath, err) + return nil, fmt.Errorf("failed to open grids storage at %s: %v", dbPath, err) } - jsonkv := tabledr.(*jsontable.JSONDriver) - ts := timestamp.NewTimestamp() + o := &Graph{ - jsonkv: jsonkv, - ts: &ts, - graphID: name, + driver: drvr, + ts: &ts, + graphID: name, + tempDeletedEdges: make(map[string]struct{}), + edgesMutex: sync.Mutex{}, + BulkLoaderWorkers: conf.BulkLoaderWorkers, } return o, nil } @@ -130,11 +126,78 @@ func (kgraph *GDB) DeleteGraph(graph string) error { if err != nil { return nil } + kgraph.mu.Lock() + defer kgraph.mu.Unlock() if d, ok := kgraph.drivers[graph]; ok { d.Close() delete(kgraph.drivers, graph) } - dbPath := filepath.Join(kgraph.basePath, graph) + dbPath := filepath.Join(kgraph.conf.GraphDir, graph) os.RemoveAll(dbPath) return nil } + +func openGridKVDriverWithRetry(conf Config, dbPath string) (*driver.GridKVDriver, error) { + lockWaitSeconds := getenvInt("GRIDS_OPEN_LOCK_WAIT_SECONDS", 120) + retryMillis := getenvInt("GRIDS_OPEN_LOCK_RETRY_MILLIS", 1000) + if retryMillis <= 0 { + retryMillis = 1000 + } + + deadline := time.Now().Add(time.Duration(lockWaitSeconds) * time.Second) + attempt := 0 + for { + drvr, err := driver.NewGridKVDriver(dbPath, conf.Driver) + if err == nil { + if attempt > 0 { + fmt.Printf("GRIDS lock resolved path=%s attempts=%d\n", dbPath, attempt+1) + } + return drvr, nil + } + if !isLikelyFileLockError(err) || lockWaitSeconds <= 0 || time.Now().After(deadline) { + return nil, err + } + + attempt++ + if attempt == 1 || attempt%10 == 0 { + remaining := time.Until(deadline).Round(time.Second) + fmt.Printf("GRIDS lock wait path=%s attempt=%d remaining=%s err=%v\n", dbPath, attempt, remaining, err) + } + time.Sleep(time.Duration(retryMillis) * time.Millisecond) + } +} + +func isLikelyFileLockError(err error) bool { + if err == nil { + return false + } + s := strings.ToLower(err.Error()) + // Some backends bubble lock contention as plain EAGAIN text without the + // word "lock", e.g. "resource temporarily unavailable". + if strings.Contains(s, "resource temporarily unavailable") || + strings.Contains(s, "database is locked") || + strings.Contains(s, "eagain") { + return true + } + if !strings.Contains(s, "lock") { + return false + } + return strings.Contains(s, "resource temporarily unavailable") || + strings.Contains(s, "held by") || + strings.Contains(s, "another process") || + strings.Contains(s, "is locked") || + strings.Contains(s, "cannot acquire") || + strings.Contains(s, "timeout") +} + +func getenvInt(key string, def int) int { + raw := strings.TrimSpace(os.Getenv(key)) + if raw == "" { + return def + } + v, err := strconv.Atoi(raw) + if err != nil { + return def + } + return v +} diff --git a/grids/optimizer.go b/grids/optimizer.go index 371c3010..552f9a3f 100644 --- a/grids/optimizer.go +++ b/grids/optimizer.go @@ -1,6 +1,7 @@ package grids import ( + "github.com/bmeg/grip/grids/key" "github.com/bmeg/grip/gripql" "github.com/bmeg/grip/util/protoutil" ) @@ -75,8 +76,8 @@ var startOptimizations = []OptimizationRule{ has := pipe[2].GetHas() labels := protoutil.AsStringList(pipe[1].GetHasLabel()) for i, label := range labels { - if label[:2] != VTABLE_PREFIX { - labels[i] = VTABLE_PREFIX + label + if label[:2] != key.VertexTablePrefix { + labels[i] = key.VertexTablePrefix + label } } var optimized = []*gripql.GraphStatement{ @@ -109,8 +110,8 @@ var startOptimizations = []OptimizationRule{ Replace: func(pipe []*gripql.GraphStatement) []*gripql.GraphStatement { labels := protoutil.AsStringList(pipe[1].GetHasLabel()) for i, label := range labels { - if label[:2] != VTABLE_PREFIX { - labels[i] = VTABLE_PREFIX + label + if label[:2] != key.VertexTablePrefix { + labels[i] = key.VertexTablePrefix + label } } var optimized = []*gripql.GraphStatement{ diff --git a/grids/persistence_delete_restart_test.go b/grids/persistence_delete_restart_test.go new file mode 100644 index 00000000..b70109b4 --- /dev/null +++ b/grids/persistence_delete_restart_test.go @@ -0,0 +1,261 @@ +package grids + +import ( + "bytes" + "strconv" + "testing" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/pebblebulk" + "github.com/bmeg/grip/gdbi" + "github.com/bmeg/grip/grids/key" +) + +func bulkAddElems(t *testing.T, g *Graph, elems []*gdbi.GraphElement) { + t.Helper() + ch := make(chan *gdbi.GraphElement, len(elems)) + for _, e := range elems { + ch <- e + } + close(ch) + if err := g.BulkAdd(ch); err != nil { + t.Fatalf("BulkAdd failed: %v", err) + } +} + +func countPosKeysForID(t *testing.T, kv *pebblebulk.PebbleKV, id string) int { + t.Helper() + count := 0 + err := kv.View(func(it *pebblebulk.PebbleIterator) error { + prefix := []byte{benchtop.PosPrefix} + for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { + _, rowID := benchtop.ParsePosKey(it.Key()) + if string(rowID) == id { + count++ + } + } + return nil + }) + if err != nil { + t.Fatalf("countPosKeysForID iterator error: %v", err) + } + return count +} + +func listPosKeysForID(t *testing.T, kv *pebblebulk.PebbleKV, id string) []string { + t.Helper() + out := []string{} + err := kv.View(func(it *pebblebulk.PebbleIterator) error { + prefix := []byte{benchtop.PosPrefix} + for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { + tid, rowID := benchtop.ParsePosKey(it.Key()) + if string(rowID) != id { + continue + } + val, err := it.Value() + if err != nil { + out = append(out, "value_error") + continue + } + loc := benchtop.DecodeRowLoc(val) + if loc == nil { + out = append(out, "tid="+strconv.Itoa(int(tid))+" loc=nil") + continue + } + out = append(out, "tid="+ + strconv.Itoa(int(tid))+ + " loc.table="+strconv.Itoa(int(loc.TableId))) + } + return nil + }) + if err != nil { + t.Fatalf("listPosKeysForID iterator error: %v", err) + } + return out +} + +func openGraphForTest(t *testing.T, conf Config, graph string) (*GDB, *Graph) { + t.Helper() + dbi, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("NewGraphDB failed: %v", err) + } + gi, err := dbi.Graph(graph) + if err != nil { + dbi.Close() + t.Fatalf("Graph(%s) failed: %v", graph, err) + } + return dbi.(*GDB), gi.(*Graph) +} + +func TestDeletePersistsAcrossRestart(t *testing.T) { + conf := Config{ + GraphDir: t.TempDir(), + Driver: "jsontable", + } + const graphName = "g" + + // First run: create graph and load data. + dbi, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("NewGraphDB failed: %v", err) + } + if err := dbi.AddGraph(graphName); err != nil { + t.Fatalf("AddGraph failed: %v", err) + } + gi, err := dbi.Graph(graphName) + if err != nil { + t.Fatalf("Graph failed: %v", err) + } + g := gi.(*Graph) + + elems := make([]*gdbi.GraphElement, 0, 10) + for i := 0; i < 10; i++ { + id := "obs:" + string(rune('a'+i)) + elems = append(elems, &gdbi.GraphElement{ + Vertex: &gdbi.Vertex{ + ID: id, + Label: "Observation", + Data: map[string]any{ + "status": "final", + "n": i, + }, + }, + }) + } + bulkAddElems(t, g, elems) + dbi.Close() + + // Restart, then delete a subset. + dbi2, g2 := openGraphForTest(t, conf, graphName) + for _, id := range []string{"obs:a", "obs:b", "obs:c", "obs:d", "obs:e"} { + var found bool + var label string + err := g2.driver.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + uid, _ := g2.driver.GetID(id) + vk := key.VertexKey(uid) + if err := it.Seek(vk); err != nil { + return err + } + if it.Valid() && bytes.Equal(it.Key(), vk) { + v, err := it.Value() + if err != nil { + return err + } + found = true + label, _ = benchtop.DecodeVertexValue(v) + } + return nil + }) + if err != nil { + dbi2.Close() + t.Fatalf("pre-delete vertex key check failed for %s: %v", id, err) + } + if !found || label == "" { + dbi2.Close() + t.Fatalf("pre-delete vertex key missing/empty for %s: found=%v label=%q", id, found, label) + } + } + del := &gdbi.DeleteData{ + Graph: graphName, + Vertices: []string{"obs:a", "obs:b", "obs:c", "obs:d", "obs:e"}, + } + if err := g2.BulkDel(del); err != nil { + dbi2.Close() + t.Fatalf("BulkDel failed: %v", err) + } + for _, id := range del.Vertices { + if c := countPosKeysForID(t, g2.driver.Pkv, id); c != 0 { + t.Fatalf("post-delete pre-restart expected 0 pos keys for %s, got %d (%v)", id, c, listPosKeysForID(t, g2.driver.Pkv, id)) + } + } + dbi2.Close() + + // Restart again and verify both logical and persisted location state. + dbi3, g3 := openGraphForTest(t, conf, graphName) + defer dbi3.Close() + + // Deleted IDs should not resolve or exist as vertices. + for _, id := range []string{"obs:a", "obs:b", "obs:c", "obs:d", "obs:e"} { + if v := g3.GetVertex(id, false); v != nil { + t.Fatalf("expected deleted vertex %s to be absent, got %#v", id, v) + } + } + + // Non-deleted IDs should still exist. + for _, id := range []string{"obs:f", "obs:g", "obs:h", "obs:i", "obs:j"} { + if v := g3.GetVertex(id, false); v == nil { + t.Fatalf("expected surviving vertex %s to exist", id) + } + } + +} + +func TestBulkDelReingestedIDsDeleteAgain(t *testing.T) { + conf := Config{ + GraphDir: t.TempDir(), + Driver: "jsontable", + } + const graphName = "g" + + dbi, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("NewGraphDB failed: %v", err) + } + defer dbi.Close() + if err := dbi.AddGraph(graphName); err != nil { + t.Fatalf("AddGraph failed: %v", err) + } + gi, err := dbi.Graph(graphName) + if err != nil { + t.Fatalf("Graph failed: %v", err) + } + g := gi.(*Graph) + + ids := []string{"obs:a", "obs:b", "obs:c"} + load := func() { + elems := make([]*gdbi.GraphElement, 0, len(ids)) + for _, id := range ids { + elems = append(elems, &gdbi.GraphElement{ + Vertex: &gdbi.Vertex{ + ID: id, + Label: "Observation", + Data: map[string]any{"status": "final"}, + }, + }) + } + bulkAddElems(t, g, elems) + } + + del := &gdbi.DeleteData{ + Graph: graphName, + Vertices: append([]string(nil), ids...), + } + + // First load/delete should succeed. + load() + if err := g.BulkDel(del); err != nil { + t.Fatalf("first BulkDel failed: %v", err) + } + for _, id := range ids { + if v := g.GetVertex(id, false); v != nil { + t.Fatalf("expected %s deleted after first BulkDel", id) + } + } + + // Re-load the same IDs and delete again; this must always execute and delete. + load() + for _, id := range ids { + if v := g.GetVertex(id, false); v == nil { + t.Fatalf("expected %s to exist after reload", id) + } + } + if err := g.BulkDel(del); err != nil { + t.Fatalf("second BulkDel failed: %v", err) + } + for _, id := range ids { + if v := g.GetVertex(id, false); v != nil { + t.Fatalf("expected %s deleted after second BulkDel", id) + } + } +} diff --git a/grids/processor.go b/grids/processor.go index 1f8472a5..3a27cf9e 100644 --- a/grids/processor.go +++ b/grids/processor.go @@ -2,8 +2,14 @@ package grids import ( "context" + "strings" + "time" + "github.com/bmeg/benchtop" "github.com/bmeg/grip/gdbi" + "github.com/bmeg/grip/grids/driver" + "github.com/bmeg/grip/grids/filter" + "github.com/bmeg/grip/grids/key" "github.com/bmeg/grip/gripql" "github.com/bmeg/grip/log" ) @@ -12,18 +18,20 @@ import ( // LookupVertexHasLabelCondIndex look up vertices has label type lookupVertsHasLabelCondIndexStep struct { - labels []string - expr *gripql.HasExpression - loadData bool + labels []string + expr *gripql.HasExpression + loadData bool + projectedFields []string } func (t lookupVertsHasLabelCondIndexStep) GetProcessor(db gdbi.GraphInterface, ps gdbi.PipelineState) (gdbi.Processor, error) { graph := db.(*Graph) return &lookupVertsHasLabelCondIndexProc{ - db: graph, - expr: t.expr, - labels: t.labels, - loadData: ps.StepLoadData(), + db: graph, + expr: t.expr, + labels: t.labels, + loadData: ps.StepLoadData(), + projectedFields: normalizeProjectedFields(ps.StepRequiredFields()), }, nil } @@ -33,20 +41,198 @@ func (t lookupVertsHasLabelCondIndexStep) GetType() gdbi.DataType { } type lookupVertsHasLabelCondIndexProc struct { - db *Graph - labels []string - expr *gripql.HasExpression - loadData bool + db *Graph + labels []string + expr *gripql.HasExpression + loadData bool + projectedFields []string +} + +func normalizeProjectedFields(in []string) []string { + if len(in) == 0 { + return nil + } + out := []string{} + seen := map[string]struct{}{} + for _, f := range in { + if f == "" || f == "*" { + return nil + } + if strings.HasPrefix(f, "$") { + // keep current-step top-level paths only + if strings.HasPrefix(f, "$.") { + f = strings.TrimPrefix(f, "$.") + } else if strings.HasPrefix(f, "$_current.") { + f = strings.TrimPrefix(f, "$_current.") + } else { + continue + } + } + if strings.Contains(f, ".") || strings.Contains(f, "[") { + continue + } + if _, ok := seen[f]; ok { + continue + } + seen[f] = struct{}{} + out = append(out, f) + } + return out +} + +func emitIndexedVertexBatches(ctx context.Context, table benchtop.TableStore, traveler gdbi.Traveler, label string, fields []string, in <-chan benchtop.Index, out gdbi.OutPipe) int { + locs := make([]*benchtop.RowLoc, 0, resolveBatchSize) + ids := make([]string, 0, resolveBatchSize) + total := 0 + + flush := func() bool { + if len(locs) == 0 { + return true + } + rows, errs := table.GetRows(locs) + for i := range rows { + if i >= len(errs) || errs[i] != nil { + continue + } + v := gdbi.Vertex{ + ID: ids[i], + Label: label, + Data: projectRowMap(rows[i], fields), + Loaded: true, + } + select { + case <-ctx.Done(): + return false + case out <- traveler.AddCurrent(&v): + } + total++ + } + locs = locs[:0] + ids = ids[:0] + return true + } + + for entry := range in { + if ctx.Err() != nil { + return total + } + if entry.Loc == nil { + continue + } + locs = append(locs, entry.Loc) + ids = append(ids, string(entry.Key)) + if len(locs) >= resolveBatchSize { + if !flush() { + return total + } + } + } + flush() + return total +} + +func emitIndexedVertexBatchesAnyTable(ctx context.Context, g *Graph, traveler gdbi.Traveler, fields []string, in <-chan benchtop.Index, out gdbi.OutPipe) int { + type tableBatch struct { + table *driver.BackendTable + locs []*benchtop.RowLoc + ids []string + } + batches := map[uint16]*tableBatch{} + pending := 0 + total := 0 + + flushAll := func() bool { + if pending == 0 { + return true + } + for _, b := range batches { + if b == nil || b.table == nil || len(b.locs) == 0 { + continue + } + rows, errs := b.table.GetRows(b.locs) + for i := range rows { + if i >= len(errs) || errs[i] != nil { + continue + } + v := gdbi.Vertex{ + ID: b.ids[i], + Label: b.table.Label, + Data: projectRowMap(rows[i], fields), + Loaded: true, + } + select { + case <-ctx.Done(): + return false + case out <- traveler.AddCurrent(&v): + } + total++ + } + b.locs = b.locs[:0] + b.ids = b.ids[:0] + } + pending = 0 + return true + } + + for entry := range in { + if ctx.Err() != nil { + return total + } + if entry.Loc == nil || len(entry.Key) == 0 { + continue + } + tid := entry.Loc.TableId + b, ok := batches[tid] + if !ok { + t, err := g.driver.GetTableByID(tid) + if err != nil { + continue + } + b = &tableBatch{ + table: t, + locs: make([]*benchtop.RowLoc, 0, 1024), + ids: make([]string, 0, 1024), + } + batches[tid] = b + } + b.locs = append(b.locs, entry.Loc) + b.ids = append(b.ids, string(entry.Key)) + pending++ + if pending >= resolveBatchSize { + if !flushAll() { + return total + } + } + } + flushAll() + return total } func (l *lookupVertsHasLabelCondIndexProc) Process(ctx context.Context, man gdbi.Manager, in gdbi.InPipe, out gdbi.OutPipe) context.Context { + loadData := l.loadData var exists = true // Here if one of l.labels doesn't exist then not going to be querying all the data so leave it like this. cond := l.expr.GetCondition() + // If condition is simple, we check if field is indexed. + // But how to check without loading all tables? + // We iterate labels. For each label, resolve ID, get table, check if field is indexed. if cond != nil { - for _, iterLabel := range l.labels { - tabel, ok := l.db.jsonkv.Tables[iterLabel] + for _, label := range l.labels { + tID, err := l.db.driver.TableDr.LookupTableID(label) + if err != nil { + exists = false + break + } + l.db.driver.Lock.RLock() + tabel, ok := l.db.driver.TablesByID[tID] + l.db.driver.Lock.RUnlock() if !ok { + // Table loaded? + // If not loaded, we don't know if field is indexed. + // But fields are loaded at startup. So if table not in Tables, maybe fields are not loaded. + // driver.Tables should contain all tables with fields? + // driver.LoadFields() populates Tables for any table with fields. + // So if not in Tables, implies no fields indexed? exists = false break } @@ -57,63 +243,146 @@ func (l *lookupVertsHasLabelCondIndexProc) Process(ctx context.Context, man gdbi } } count := 0 - if !exists || (l.expr == nil && cond == nil) { - log.Debugln("Using base case processor lookupVertsHasLabelCondIndexProc") + if l.expr == nil && cond == nil { + log.Debugln("Using live-id base case processor lookupVertsHasLabelCondIndexProc") go func() { defer close(out) for t := range in { for _, label := range l.labels { - tableFound, ok := l.db.jsonkv.Tables[label] - if !ok { - log.Debugf("BSONTable for label '%s' is nil. Cannot scan.", label) - continue + if ctx.Err() != nil { + return } - if l.loadData { - for roMaps := range tableFound.ScanDoc(&GripQLFilter{Expression: l.expr}) { - v := gdbi.Vertex{ - Label: label[2:], - Loaded: l.loadData, - ID: roMaps["_id"].(string), - } - delete(roMaps, "_id") - v.Data = roMaps - count += 1 - out <- t.AddCurrent(v.Copy()) - } - } else { - for roMaps := range tableFound.ScanId(&GripQLFilter{Expression: l.expr}) { + labelName := strings.TrimPrefix(label, key.VertexTablePrefix) + if !loadData { + for idx := range l.db.driver.RowLocsByLabel(labelName) { v := gdbi.Vertex{ - Label: label[2:], - Loaded: l.loadData, - ID: roMaps, + ID: string(idx.Key), + Label: labelName, Data: map[string]any{}, + Loaded: false, } - count += 1 - out <- t.AddCurrent(v.Copy()) + out <- t.AddCurrent(&v) } + continue + } + tableFound, err := l.db.driver.GetOrLoadTable(label) + if err != nil { + log.Debugf("Table for label '%s' not found: %v", label, err) + continue + } + emitIndexedVertexBatches( + ctx, + tableFound, + t, + labelName, + l.projectedFields, + l.db.driver.RowLocsByLabel(labelName), + out, + ) + } + } + }() + return ctx + } + + if !exists { + log.Debugln("Using live fallback processor lookupVertsHasLabelCondIndexProc") + go func() { + defer close(out) + cond := l.expr.GetCondition() + for t := range in { + for _, label := range l.labels { + tableFound, err := l.db.driver.GetOrLoadTable(label) + if err != nil { + log.Debugf("Table for label '%s' not found: %v", label, err) + continue + } + labelName := strings.TrimPrefix(label, key.VertexTablePrefix) + if loadData { + count += emitIndexedVertexBatches( + ctx, + tableFound, + t, + labelName, + l.projectedFields, + l.db.driver.RowIdsByLabelFieldValue( + labelName, + cond.Key, + cond.Value.AsInterface(), + filter.ToQueryCondition(cond.Condition), + ), + out, + ) + continue + } + for entry := range l.db.driver.RowIdsByLabelFieldValue( + labelName, + cond.Key, + cond.Value.AsInterface(), + filter.ToQueryCondition(cond.Condition), + ) { + v := gdbi.Vertex{ + Label: labelName, + Loaded: false, + ID: string(entry.Key), + Data: map[string]any{}, + } + count += 1 + out <- t.AddCurrent(&v) } } } }() } else { log.Debugln("Using optimized custom processor lookupVertsHasLabelCondIndexProc") + if loadData { + go func() { + defer close(out) + for t := range in { + if ctx.Err() != nil { + return + } + cond := l.expr.GetCondition() + for _, label := range l.labels { + tableFound, err := l.db.driver.GetOrLoadTable(label) + if err != nil { + continue + } + emitIndexedVertexBatches( + ctx, + tableFound, + t, + strings.TrimPrefix(label, key.VertexTablePrefix), + l.projectedFields, + l.db.driver.RowIdsByLabelFieldValue(label[2:], cond.Key, cond.Value.AsInterface(), filter.ToQueryCondition(cond.Condition)), + out, + ) + } + } + }() + return ctx + } queryChan := make(chan gdbi.ElementLookup, 100) go func() { defer close(queryChan) for t := range in { cond := l.expr.GetCondition() for _, label := range l.labels { - for id := range l.db.jsonkv.RowIdsByLabelFieldValue(label, cond.Key, cond.Value.AsInterface(), cond.Condition) { - queryChan <- gdbi.ElementLookup{ID: id, Ref: t} + for entry := range l.db.driver.RowIdsByLabelFieldValue(label[2:], cond.Key, cond.Value.AsInterface(), filter.ToQueryCondition(cond.Condition)) { + queryChan <- gdbi.ElementLookup{ + ID: string(entry.Key), + Ref: t, + Priv: lookupPriv{loc: entry.Loc, fields: l.projectedFields}, + } } } } }() go func() { defer close(out) - for v := range l.db.GetVertexChannel(ctx, queryChan, l.loadData) { + for v := range l.db.GetVertexChannel(ctx, queryChan, loadData) { i := v.Ref - out <- i.AddCurrent(v.Vertex.Copy()) + out <- i.AddCurrent(v.Vertex) } }() } @@ -129,9 +398,11 @@ type lookupVertsCondIndexStep struct { func (t lookupVertsCondIndexStep) GetProcessor(db gdbi.GraphInterface, ps gdbi.PipelineState) (gdbi.Processor, error) { graph := db.(*Graph) return &lookupVertsCondIndexProc{ - db: graph, - expr: t.expr, - loadData: ps.StepLoadData()}, nil + db: graph, + expr: t.expr, + loadData: ps.StepLoadData(), + projectedFields: normalizeProjectedFields(ps.StepRequiredFields()), + }, nil } func (t lookupVertsCondIndexStep) GetType() gdbi.DataType { @@ -139,77 +410,175 @@ func (t lookupVertsCondIndexStep) GetType() gdbi.DataType { } type lookupVertsCondIndexProc struct { - db *Graph - expr *gripql.HasExpression - loadData bool - fallback bool + db *Graph + expr *gripql.HasExpression + loadData bool + projectedFields []string + fallback bool } func (l *lookupVertsCondIndexProc) Process(ctx context.Context, man gdbi.Manager, in gdbi.InPipe, out gdbi.OutPipe) context.Context { + loadData := l.loadData log.Debugln("Entering lookupVertsCondIndexProc custom processor") cond := l.expr.GetCondition() - /* Indexing only works if every vertex label is indexed for that specific field and it's only a condition Filter - otherwise this lookup will not fetch everything that was asked for */ - allMatch := cond != nil - if allMatch { - for lbl := range l.db.jsonkv.GetLabels(false, false) { - if table, exists := l.db.jsonkv.Tables[lbl]; exists { - if _, ok := table.Fields[cond.Key]; !ok { - allMatch = false - break - } - } else { - allMatch = false - break - } - } - } - /* Optimized indexing only works for Simple filters. If compound filter or index doesn't exist, use backup method */ - if cond != nil && allMatch { + if cond != nil { log.Debugln("Chose index optimized V().Has() statement path") + vertexLabels := []string{} + for label := range l.db.driver.GetLabels(false, true) { + vertexLabels = append(vertexLabels, label) + } + + if loadData { + go func() { + defer close(out) + start := time.Now() + var produced int + for t := range in { + if ctx.Err() != nil { + return + } + produced += emitIndexedVertexBatchesAnyTable( + ctx, + l.db, + t, + l.projectedFields, + l.db.driver.RowIdsByLabelsFieldValue( + vertexLabels, + cond.Key, + cond.Value.AsInterface(), + filter.ToQueryCondition(cond.Condition), + ), + out, + ) + } + log.Debugf("lookupVertsCondIndexProc direct emit completed rows=%d elapsed=%s", produced, time.Since(start).Round(time.Millisecond)) + }() + return ctx + } + queryChan := make(chan gdbi.ElementLookup, 100) + + // Stream index matches per input traveler to avoid building large in-memory + // caches that can stall under backpressure. go func() { defer close(queryChan) + start := time.Now() + var travelers int + var totalMatches int for t := range in { - for id := range l.db.jsonkv.RowIdsByHas( - cond.Key, - cond.Value.AsInterface(), - cond.Condition, - ) { - queryChan <- gdbi.ElementLookup{ - ID: id, - Ref: t, + if ctx.Err() != nil { + return + } + travelers++ + matches := 0 + if len(vertexLabels) == 0 { + for entry := range l.db.driver.RowIdsByHas( + cond.Key, + cond.Value.AsInterface(), + filter.ToQueryCondition(cond.Condition), + ) { + e := gdbi.ElementLookup{ + ID: string(entry.Key), + Ref: t, + Priv: lookupPriv{loc: entry.Loc, fields: l.projectedFields}, + } + select { + case <-ctx.Done(): + return + case queryChan <- e: + } + matches++ + totalMatches++ + } + } else { + for entry := range l.db.driver.RowIdsByLabelsFieldValue( + vertexLabels, + cond.Key, + cond.Value.AsInterface(), + filter.ToQueryCondition(cond.Condition), + ) { + e := gdbi.ElementLookup{ + ID: string(entry.Key), + Ref: t, + Priv: lookupPriv{loc: entry.Loc, fields: l.projectedFields}, + } + select { + case <-ctx.Done(): + return + case queryChan <- e: + } + matches++ + totalMatches++ } } + log.Debugf("Index lookup streamed %d rows for traveler=%d", matches, travelers) } + log.Debugf("Index lookup completed travelers=%d totalMatches=%d elapsed=%s", travelers, totalMatches, time.Since(start).Round(time.Millisecond)) }() // Process queryChan with GetVertexChannel for indexed case go func() { defer close(out) - for v := range l.db.GetVertexChannel(ctx, queryChan, l.loadData) { + start := time.Now() + var produced int + for v := range l.db.GetVertexChannel(ctx, queryChan, loadData) { + if ctx.Err() != nil { + return + } + if v.Ref == nil || v.Vertex == nil { + continue + } i := v.Ref - out <- i.AddCurrent(v.Vertex.Copy()) + select { + case <-ctx.Done(): + return + case out <- i.AddCurrent(v.Vertex): + } + produced++ + if produced%10000 == 0 { + log.Debugf("lookupVertsCondIndexProc emit progress rows=%d elapsed=%s", produced, time.Since(start).Round(time.Millisecond)) + } } + log.Debugf("lookupVertsCondIndexProc emit completed rows=%d elapsed=%s", produced, time.Since(start).Round(time.Millisecond)) }() } else { log.Debugf("Base case GetVertexList is used. No indexing") go func() { defer close(out) for t := range in { - for tLabel, table := range l.db.jsonkv.Tables { - if tLabel[:2] == VTABLE_PREFIX { - for v := range table.ScanDoc(&GripQLFilter{Expression: l.expr}) { + for _, tLabel := range l.db.driver.List() { + if strings.HasPrefix(tLabel, key.VertexTablePrefix) { + table, err := l.db.driver.GetOrLoadTable(tLabel) + if err != nil { + continue + } + if !loadData { + for id := range table.ScanId(&filter.GripQLFilter{Expression: l.expr}) { + vertex := gdbi.Vertex{ + ID: id, + Label: strings.TrimPrefix(tLabel, key.VertexTablePrefix), + Data: map[string]any{}, + Loaded: false, + } + out <- t.AddCurrent(&vertex) + } + continue + } + filterExpr := &filter.GripQLFilter{Expression: l.expr} + stream := table.ScanDoc(filterExpr) + if len(l.projectedFields) > 0 { + stream = table.ScanDocProjected(l.projectedFields, filterExpr) + } + for v := range stream { vertex := gdbi.Vertex{ ID: v["_id"].(string), - Label: tLabel[len(VTABLE_PREFIX):], // Extract label from table name - Data: v, // Use full data from ScanDoc - Loaded: l.loadData, // Set Loaded based on l.loadData + Label: strings.TrimPrefix(tLabel, key.VertexTablePrefix), + Data: v, + Loaded: true, } - // Send directly to out channel - out <- t.AddCurrent(vertex.Copy()) + out <- t.AddCurrent(&vertex) } } } diff --git a/grids/reproduce_issue_test.go b/grids/reproduce_issue_test.go new file mode 100644 index 00000000..94a098bc --- /dev/null +++ b/grids/reproduce_issue_test.go @@ -0,0 +1,186 @@ +package grids + +import ( + "context" + "fmt" + "os" + "testing" + "time" + + "github.com/bmeg/grip/gdbi" +) + +// TestIssueRepro attempts to reproduce the issue where hasLabel returns duplicates after restart. +func TestIssueRepro(t *testing.T) { + conf := Config{ + GraphDir: t.TempDir(), + Driver: "jsontable", + } + const graphName = "g" + const label = "Observation" + const numVertices = 100 + + // Helper to load data + loadData := func(t *testing.T, g *Graph, start int, count int) { + t.Helper() + elems := make([]*gdbi.GraphElement, 0, count) + for i := 0; i < count; i++ { + id := fmt.Sprintf("obs:%d", start+i) + elems = append(elems, &gdbi.GraphElement{ + Vertex: &gdbi.Vertex{ + ID: id, + Label: label, + Data: map[string]any{ + "status": "final", + "n": start + i, + }, + }, + }) + } + if err := g.BulkAdd(asChan(elems)); err != nil { + t.Fatalf("BulkAdd failed: %v", err) + } + } + + // 1. First run: create graph and load data. + t.Log("--- Run 1 ---") + dbi, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("NewGraphDB failed: %v", err) + } + if err := dbi.AddGraph(graphName); err != nil { + t.Fatalf("AddGraph failed: %v", err) + } + gi, err := dbi.Graph(graphName) + if err != nil { + t.Fatalf("Graph failed: %v", err) + } + g := gi.(*Graph) + + loadData(t, g, 0, numVertices) + + // Query 1 + count1 := countLabel(t, g, label) + t.Logf("Run 1 Count: %d", count1) + if count1 != numVertices { + t.Errorf("Run 1: expected %d, got %d", numVertices, count1) + } + + // Delete some data (mimic user script) + // Deleting first 50 + toDelete := []string{} + for i := 0; i < 50; i++ { + toDelete = append(toDelete, fmt.Sprintf("obs:%d", i)) + } + delData := &gdbi.DeleteData{ + Graph: graphName, + Vertices: toDelete, + } + if err := g.BulkDel(delData); err != nil { + t.Fatalf("BulkDel failed: %v", err) + } + + count1b := countLabel(t, g, label) + t.Logf("Run 1 Post-Delete Count: %d", count1b) + if count1b != 50 { + t.Errorf("Run 1 Post-Delete: expected 50, got %d", count1b) + } + + // Reload data (mimic user script) + // Reloading same 100 vertices + loadData(t, g, 0, numVertices) + + count1c := countLabel(t, g, label) + t.Logf("Run 1 Post-Reload Count: %d", count1c) + if count1c != numVertices { + t.Errorf("Run 1 Post-Reload: expected %d, got %d", numVertices, count1c) + } + + dbi.Close() + + // 2. Restart and Query + t.Log("--- Run 2 (Restart) ---") + time.Sleep(100 * time.Millisecond) // Give it a moment + + dbi2, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("NewGraphDB restart failed: %v", err) + } + gi2, err := dbi2.Graph(graphName) + if err != nil { + t.Fatalf("Graph restart failed: %v", err) + } + g2 := gi2.(*Graph) + + count2 := countLabel(t, g2, label) + t.Logf("Run 2 Count: %d", count2) + + // Also check for duplicates specifically + ids := getIDs(t, g2, label) + if hasDuplicates(ids) { + t.Errorf("Run 2 found duplicates in result IDs!") + dumpDuplicates(t, ids) + } + + if count2 != numVertices { + t.Fatalf("Run 2: expected %d, got %d. (Duplicates detected?)", numVertices, count2) + } + + dbi2.Close() + os.RemoveAll(conf.GraphDir) +} + +func asChan(elems []*gdbi.GraphElement) chan *gdbi.GraphElement { + ch := make(chan *gdbi.GraphElement, len(elems)) + for _, e := range elems { + ch <- e + } + close(ch) + return ch +} + +func countLabel(t *testing.T, g *Graph, label string) int { + ctx := context.Background() + // Using VertexLabelScan via driver logic, which is what V().hasLabel() does + + // We can call VertexLabelScan directly + scanChan := g.VertexLabelScan(ctx, label) + count := 0 + for range scanChan { + count++ + } + return count +} + +func getIDs(t *testing.T, g *Graph, label string) []string { + ctx := context.Background() + scanChan := g.VertexLabelScan(ctx, label) + out := []string{} + for id := range scanChan { + out = append(out, id) + } + return out +} + +func hasDuplicates(ids []string) bool { + seen := make(map[string]struct{}) + for _, id := range ids { + if _, ok := seen[id]; ok { + return true + } + seen[id] = struct{}{} + } + return false +} + +func dumpDuplicates(t *testing.T, ids []string) { + seen := make(map[string]int) + for _, id := range ids { + seen[id]++ + } + for id, count := range seen { + if count > 1 { + t.Logf("Duplicate: %s appears %d times", id, count) + } + } +} diff --git a/grids/rowids_fallback_test.go b/grids/rowids_fallback_test.go new file mode 100644 index 00000000..c221cf39 --- /dev/null +++ b/grids/rowids_fallback_test.go @@ -0,0 +1,215 @@ +package grids + +import ( + "testing" + + "github.com/bmeg/benchtop/query" + "github.com/bmeg/grip/gdbi" +) + +func TestRowIdsByLabelsFieldValueFallsBackWhenUnindexed(t *testing.T) { + conf := Config{ + GraphDir: t.TempDir(), + Driver: "jsontable", + } + dbi, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("NewGraphDB failed: %v", err) + } + defer dbi.Close() + + const graphName = "g" + if err := dbi.AddGraph(graphName); err != nil { + t.Fatalf("AddGraph failed: %v", err) + } + gi, err := dbi.Graph(graphName) + if err != nil { + t.Fatalf("Graph failed: %v", err) + } + g := gi.(*Graph) + + elems := []*gdbi.GraphElement{ + { + Vertex: &gdbi.Vertex{ + ID: "obs:1", + Label: "Observation", + Data: map[string]any{ + "auth_resource_path": "/programs/calypr/projects/test", + }, + }, + }, + { + Vertex: &gdbi.Vertex{ + ID: "obs:2", + Label: "Observation", + Data: map[string]any{ + "auth_resource_path": "/programs/calypr/projects/testtwo", + }, + }, + }, + } + ch := make(chan *gdbi.GraphElement, len(elems)) + for _, e := range elems { + ch <- e + } + close(ch) + if err := g.BulkAdd(ch); err != nil { + t.Fatalf("BulkAdd failed: %v", err) + } + + // Do not add index; this should use fallback scan path and still match. + count := 0 + for range g.driver.RowIdsByLabelsFieldValue( + []string{"Observation"}, + "auth_resource_path", + "/programs/calypr/projects/test", + query.EQ, + ) { + count++ + } + if count != 1 { + t.Fatalf("expected 1 unindexed fallback match, got %d", count) + } +} + +func TestRowIdsByLabelsFieldValueFallbackSkipsDeletedTombstones(t *testing.T) { + conf := Config{ + GraphDir: t.TempDir(), + Driver: "jsontable", + } + dbi, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("NewGraphDB failed: %v", err) + } + defer dbi.Close() + + const graphName = "g" + if err := dbi.AddGraph(graphName); err != nil { + t.Fatalf("AddGraph failed: %v", err) + } + gi, err := dbi.Graph(graphName) + if err != nil { + t.Fatalf("Graph failed: %v", err) + } + g := gi.(*Graph) + + elems := []*gdbi.GraphElement{ + { + Vertex: &gdbi.Vertex{ + ID: "obs:1", + Label: "Observation", + Data: map[string]any{ + "auth_resource_path": "/programs/calypr/projects/test", + }, + }, + }, + { + Vertex: &gdbi.Vertex{ + ID: "obs:2", + Label: "Observation", + Data: map[string]any{ + "auth_resource_path": "/programs/calypr/projects/test", + }, + }, + }, + } + ch := make(chan *gdbi.GraphElement, len(elems)) + for _, e := range elems { + ch <- e + } + close(ch) + if err := g.BulkAdd(ch); err != nil { + t.Fatalf("BulkAdd failed: %v", err) + } + + if err := g.BulkDel(&gdbi.DeleteData{ + Graph: graphName, + Vertices: []string{"obs:1"}, + }); err != nil { + t.Fatalf("BulkDel failed: %v", err) + } + + count := 0 + for range g.driver.RowIdsByLabelsFieldValue( + []string{"Observation"}, + "auth_resource_path", + "/programs/calypr/projects/test", + query.EQ, + ) { + count++ + } + if count != 1 { + t.Fatalf("expected 1 live match after delete, got %d", count) + } +} + +func TestRowIdsByLabelsFieldValueWithinFallbackSkipsDeletedTombstones(t *testing.T) { + conf := Config{ + GraphDir: t.TempDir(), + Driver: "jsontable", + } + dbi, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("NewGraphDB failed: %v", err) + } + defer dbi.Close() + + const graphName = "g" + if err := dbi.AddGraph(graphName); err != nil { + t.Fatalf("AddGraph failed: %v", err) + } + gi, err := dbi.Graph(graphName) + if err != nil { + t.Fatalf("Graph failed: %v", err) + } + g := gi.(*Graph) + + elems := []*gdbi.GraphElement{ + { + Vertex: &gdbi.Vertex{ + ID: "obs:1", + Label: "Observation", + Data: map[string]any{ + "auth_resource_path": "/programs/calypr/projects/test", + }, + }, + }, + { + Vertex: &gdbi.Vertex{ + ID: "obs:2", + Label: "Observation", + Data: map[string]any{ + "auth_resource_path": "/programs/calypr/projects/testtwo", + }, + }, + }, + } + ch := make(chan *gdbi.GraphElement, len(elems)) + for _, e := range elems { + ch <- e + } + close(ch) + if err := g.BulkAdd(ch); err != nil { + t.Fatalf("BulkAdd failed: %v", err) + } + + if err := g.BulkDel(&gdbi.DeleteData{ + Graph: graphName, + Vertices: []string{"obs:1"}, + }); err != nil { + t.Fatalf("BulkDel failed: %v", err) + } + + count := 0 + for range g.driver.RowIdsByLabelsFieldValue( + []string{"Observation"}, + "auth_resource_path", + []any{"/programs/calypr/projects/test", "/programs/calypr/projects/testtwo"}, + query.WITHIN, + ) { + count++ + } + if count != 1 { + t.Fatalf("expected 1 live within match after delete, got %d", count) + } +} diff --git a/grids/schema.go b/grids/schema.go index c12d3866..5b9bf617 100644 --- a/grids/schema.go +++ b/grids/schema.go @@ -20,7 +20,7 @@ func (ma *GDB) BuildSchema(ctx context.Context, graph string, sampleN uint32, ra log.WithFields(log.Fields{"graph": graph}).Debug("Starting KV GetSchema call") if g, ok := ma.drivers[graph]; ok { - vSchema, eSchema, err = g.sampleSchema(ctx, sampleN, random) + vSchema, eSchema, err = g.sampleSchema(ctx) if err != nil { return nil, fmt.Errorf("getting vertex schema: %v", err) } @@ -33,8 +33,8 @@ func (ma *GDB) BuildSchema(ctx context.Context, graph string, sampleN uint32, ra return nil, fmt.Errorf("Graph not found") } -func (gi *Graph) sampleSchema(ctx context.Context, n uint32, random bool) ([]*gripql.Vertex, []*gripql.Edge, error) { - labels := gi.jsonkv.List() +func (gi *Graph) sampleSchema(ctx context.Context) ([]*gripql.Vertex, []*gripql.Edge, error) { + labels := gi.driver.List() vertLabels := []string{} for _, label := range labels { if label[:2] == "v_" { diff --git a/grids/table_id_persistence_test.go b/grids/table_id_persistence_test.go new file mode 100644 index 00000000..b420a6e2 --- /dev/null +++ b/grids/table_id_persistence_test.go @@ -0,0 +1,158 @@ +package grids + +import ( + "context" + "testing" + + "github.com/bmeg/benchtop" + "github.com/bmeg/grip/gdbi" +) + +func TestTableIDPersistenceOnRestart(t *testing.T) { + conf := Config{ + GraphDir: t.TempDir(), + Driver: "jsontable", + } + const graphName = "g" + + // 1. Initial run: create graph and add high-volume data to trigger table creation. + dbi, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("NewGraphDB failed: %v", err) + } + if err := dbi.AddGraph(graphName); err != nil { + t.Fatalf("AddGraph failed: %v", err) + } + gi, err := dbi.Graph(graphName) + if err != nil { + dbi.Close() + t.Fatalf("Graph failed: %v", err) + } + g := gi.(*Graph) + + // Add data with label "Person" + elems := []*gdbi.GraphElement{ + { + Vertex: &gdbi.Vertex{ + ID: "p1", + Label: "Person", + Data: map[string]any{ + "name": "Alice", + }, + }, + }, + } + bulkAddElems(t, g, elems) + + // Get TableId and verify it's not 0 + locs, err := g.driver.GetLocBatch(context.Background(), []string{"p1"}) + if err != nil { + t.Fatalf("GetLocBatch failed for p1: %v", err) + } + loc := locs["p1"] + if loc == nil { + t.Fatalf("p1 location not found in integrated key") + } + initialTableID := loc.Loc.TableId + dbi.Close() + + // 2. Restart and verify ID metadata + dbi2, g2 := openGraphForTest(t, conf, graphName) + defer dbi2.Close() + + // Verify the table "v_Person" exists in memory and has the same ID + // We can't access g2.driver.Tables directly with "v_Person" as key because keys are uint16 now? + // Wait, keys in d.Tables ARE uint16. + // The previous test code assumed d.Tables was map[string]*BackendTable? + // But d.Tables was map[uint16]*BackendTable in driver.go step 32. + // So `g2.driver.Tables["v_Person"]` was ALREADY wrong? + // Ah, maybe the user added this test recently and it was broken? + // Or maybe I missed something. + // Anyway, to verify table exists, we should use GetOrLoadTable("v_Person") + tbl, err := g2.driver.GetOrLoadTable("v_" + "Person") + if err != nil { + t.Fatalf("table v_Person lost after restart") + } + if tbl.TableId != initialTableID { + t.Fatalf("TableId mismatch after restart: initial=%d, got=%d", initialTableID, tbl.TableId) + } + + // Verify the label lookup is populated + // Verify the label lookup is populated + table, err := g2.driver.GetTableByID(initialTableID) + if err != nil { + t.Fatalf("Label resolution failed for TableId %d after restart: %v", initialTableID, err) + } + if table.Label != "Person" { + t.Fatalf("Label mismatch after resolution: expected Person, got %s", table.Label) + } + if table.TableId != initialTableID { + t.Fatalf("Table lookup mismatch: initial=%d, got=%d", initialTableID, table.TableId) + } + + // 3. Verify querying still works (resolves label correctly) + v := g2.GetVertex("p1", true) + if v == nil { + t.Fatalf("GetVertex failed to find p1 after restart") + } + if v.Label != "Person" { + t.Fatalf("Vertex p1 label mismatch: expected Person, got %s", v.Label) + } + if v.Data["name"] != "Alice" { + t.Fatalf("Vertex p1 data mismatch: expected Alice, got %v", v.Data["name"]) + } +} + +func TestTableIDZeroRecovery(t *testing.T) { + conf := Config{ + GraphDir: t.TempDir(), + Driver: "jsontable", + } + const graphName = "g" + + // 1. Initial run: create table info manually with ID 0 in Pebble (simulation of bug) + // Actually, easier to just test that our New and getOrLoadTable self-correct. + dbi, err := NewGraphDB(conf) + if err != nil { + t.Fatalf("NewGraphDB failed: %v", err) + } + if err := dbi.AddGraph(graphName); err != nil { + t.Fatalf("AddGraph failed: %v", err) + } + gi, err := dbi.Graph(graphName) + if err != nil { + dbi.Close() + t.Fatalf("Graph failed: %v", err) + } + g := gi.(*Graph) + + // Create a new table + tblStore, err := g.driver.New("v_NewTable", nil) + if err != nil { + t.Fatalf("New table failed: %v", err) + } + + locs, err := tblStore.AddRows([]benchtop.Row{{Id: []byte("row1"), Data: map[string]any{"_id": "row1"}}}) + if err != nil { + t.Fatalf("AddRows failed: %v", err) + } + + rowLoc := locs[0] + + // Force clear the LabelLookup for this ID to simulate discovery failure + // No longer applicable as LabelLookup is gone. + // Instead we can remove the table from memory cache? + delete(g.driver.TablesByID, rowLoc.TableId) + + // Attempt resolution - it should fail now unless discovered by the fix + // Attempt resolution - it should fail now unless discovered by the fix + table, err := g.driver.GetTableByID(rowLoc.TableId) + if err != nil { + t.Fatalf("GetTableByID failed to recover table: %v", err) + } + if table.Label != "NewTable" { + t.Fatalf("Expected NewTable, got %s", table.Label) + } + + dbi.Close() +} diff --git a/gripql/inspect/inspect.go b/gripql/inspect/inspect.go index aab4fb5c..43b4f1c5 100644 --- a/gripql/inspect/inspect.go +++ b/gripql/inspect/inspect.go @@ -2,6 +2,7 @@ package inspect import ( "fmt" + "strings" "github.com/bmeg/grip/gdbi/tpath" "github.com/bmeg/grip/gripql" @@ -10,18 +11,6 @@ import ( "github.com/bmeg/grip/util/protoutil" ) -func arrayEq(a, b []string) bool { - if len(a) != len(b) { - return false - } - for i := range a { - if a[i] != b[i] { - return false - } - } - return true -} - // PipelineSteps create an array, the same length at stmts that labels the // step id for each of the GraphStatements func PipelineSteps(stmts []*gripql.GraphStatement) []string { @@ -161,6 +150,15 @@ func PipelineStepOutputs(stmts []*gripql.GraphStatement, storeMarks bool) map[st out[steps[i]] = []string{"*"} } onLast = false + case *gripql.GraphStatement_EngineCustom: + // Custom engine steps can be traversal-producing sources (for example, + // grids V().HasLabel() rewrites). When terminal, treat them as producing + // full current outputs so storage can load once at source instead of + // forcing late ID re-hydration in conversion. + if onLast { + out[steps[i]] = []string{"*"} + } + onLast = false case *gripql.GraphStatement_HasLabel: if x, ok := out[steps[i]]; ok { @@ -182,6 +180,64 @@ func PipelineStepOutputs(stmts []*gripql.GraphStatement, storeMarks bool) map[st return out } +func collectCurrentRefs(val any, out *[]string) { + switch x := val.(type) { + case map[string]any: + for _, v := range x { + collectCurrentRefs(v, out) + } + case []any: + for _, v := range x { + collectCurrentRefs(v, out) + } + case string: + if strings.HasPrefix(x, "$.") { + *out = append(*out, strings.TrimPrefix(x, "$.")) + } else if strings.HasPrefix(x, "$_current.") { + *out = append(*out, strings.TrimPrefix(x, "$_current.")) + } + } +} + +func dedupeStrings(in []string) []string { + seen := map[string]struct{}{} + out := make([]string, 0, len(in)) + for _, v := range in { + if v == "" { + continue + } + if _, ok := seen[v]; ok { + continue + } + seen[v] = struct{}{} + out = append(out, v) + } + return out +} + +// PipelineStepRequiredFields captures projection hints that can be used by +// storage backends for columnar reads without changing existing load semantics. +func PipelineStepRequiredFields(stmts []*gripql.GraphStatement) map[string][]string { + steps := PipelineSteps(stmts) + out := map[string][]string{} + for i := len(stmts) - 1; i >= 0; i-- { + switch gs := stmts[i].GetStatement().(type) { + case *gripql.GraphStatement_Fields: + fields := protoutil.AsStringList(gs.Fields) + if len(fields) > 0 { + out[steps[i]] = dedupeStrings(fields) + } + case *gripql.GraphStatement_Render: + refs := []string{} + collectCurrentRefs(gs.Render.AsInterface(), &refs) + if len(refs) > 0 { + out[steps[i]] = dedupeStrings(refs) + } + } + } + return out +} + // DEPRECATED : Was used for older version of GRIDS engine // PipelineNoLoadPath identifies 'paths' which are groups of statements that move // travelers across multiple steps, and don't require data (other then the label) diff --git a/gripql/marshal_flattened.go b/gripql/marshal_flattened.go index e1920a6f..5ef053ae 100644 --- a/gripql/marshal_flattened.go +++ b/gripql/marshal_flattened.go @@ -2,8 +2,10 @@ package gripql import ( "fmt" + "io" "github.com/bytedance/sonic" + "github.com/grpc-ecosystem/grpc-gateway/v2/runtime" "google.golang.org/protobuf/encoding/protojson" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/structpb" @@ -21,6 +23,38 @@ func NewFlattenMarshaler() *MarshalFlatten { } } +func (mflat *MarshalFlatten) ContentType(v any) string { + return "application/json" +} + +func (mflat *MarshalFlatten) NewDecoder(r io.Reader) runtime.Decoder { + return (&runtime.JSONPb{ + UnmarshalOptions: mflat.unmarshal, + }).NewDecoder(r) +} + +type flattenEncoder struct { + w io.Writer + m *MarshalFlatten +} + +func (e *flattenEncoder) Encode(v any) error { + b, err := e.m.Marshal(v) + if err != nil { + return err + } + _, err = e.w.Write(b) + if err != nil { + return err + } + _, err = e.w.Write([]byte("\n")) + return err +} + +func (mflat *MarshalFlatten) NewEncoder(w io.Writer) runtime.Encoder { + return &flattenEncoder{w: w, m: mflat} +} + func (mflat *MarshalFlatten) Marshal(d any) ([]byte, error) { switch x := d.(type) { case *Vertex: diff --git a/gripql/python/gripql/graph.py b/gripql/python/gripql/graph.py index ea7c6b84..89d5d82b 100644 --- a/gripql/python/gripql/graph.py +++ b/gripql/python/gripql/graph.py @@ -225,6 +225,8 @@ def resume(self, job_id): """ return Query(self.base_url, self.graph, self.user, self.password, self.token, self.credential_file, resume=job_id) + + def listJobs(self): url = self.url + "/job" response = self.session.get( diff --git a/gripql/python/gripql/query.py b/gripql/python/gripql/query.py index 8fa26c8c..80283885 100644 --- a/gripql/python/gripql/query.py +++ b/gripql/python/gripql/query.py @@ -9,6 +9,7 @@ import logging import requests +import struct from gripql.util import BaseConnection, Rate, raise_for_status @@ -45,11 +46,21 @@ def __init__(self, url, graph, user=None, password=None, token=None, credential_ self.resume = resume def __append(self, part): - q = self.__class__(self.base_url, self.graph, self.user, self.password, self.token, self.credential_file, self.resume) + q = self.__class__( + self.base_url, + self.graph, + self.user, + self.password, + self.token, + self.credential_file, + self.resume, + ) q.query = self.query[:] q.query.append(part) return q + + def V(self, id=[]): """ Start the query at a vertex. @@ -408,6 +419,8 @@ def to_dict(self): """ return {"query": self.query} + + def __iter__(self): return self.__stream() @@ -449,6 +462,7 @@ def __stream(self, raw=False, debug=False): logger.debug('POST %s', url) logger.debug('BODY %s', self.to_json()) logger.debug('STATUS CODE %s', response.status_code) + raise_for_status(response) for result in response.iter_lines(chunk_size=None): try: @@ -478,6 +492,23 @@ def __stream(self, raw=False, debug=False): extracted = result_dict["path"] elif "count" in result_dict: extracted = result_dict + elif "result" in result_dict and isinstance(result_dict["result"], dict): + # Bulk fallback may return proto-shaped QueryResult. + inner = result_dict["result"] + if "vertex" in inner: + extracted = inner["vertex"] + elif "edge" in inner: + extracted = inner["edge"] + elif "render" in inner: + extracted = inner["render"] + elif "path" in inner: + extracted = inner["path"] + elif "count" in inner: + extracted = {"count": inner["count"]} + elif "aggregations" in inner: + extracted = inner["aggregations"] + else: + extracted = result_dict elif "error" in result_dict: raise requests.HTTPError(result_dict['error']['message']) else: diff --git a/kvi/pebbledb/pebble_store.go b/kvi/pebbledb/pebble_store.go index d239c741..3af27f48 100644 --- a/kvi/pebbledb/pebble_store.go +++ b/kvi/pebbledb/pebble_store.go @@ -19,7 +19,9 @@ import ( var loaded = kvi.AddKVDriver("pebble", NewKVInterface) -var defaultCompactLimit = uint32(10000) +// Default to Pebble-managed background compaction only. +// Set to a non-zero value to enable manual compaction after N inserts. +var defaultCompactLimit = uint32(0) // PebbleKV is an implementation of the KVStore for badger type PebbleKV struct { @@ -41,7 +43,7 @@ func WrapPebble(db *pebble.DB) kvi.KVInterface { return &PebbleKV{ db: db, insertCount: 0, - compactLimit: 10000, + compactLimit: defaultCompactLimit, } } @@ -256,14 +258,17 @@ func (pdb *PebbleKV) BulkWrite(u func(tx kvi.KVBulkWrite) error) error { batch := pdb.db.NewBatch() ptx := &pebbleBulkWrite{pdb.db, batch, nil, nil, 0, 0} err := u(ptx) - batch.Commit(nil) + if cErr := batch.Commit(nil); cErr != nil && err == nil { + err = cErr + } batch.Close() pdb.insertCount += ptx.totalInserts - if pdb.insertCount > pdb.compactLimit { + if pdb.compactLimit > 0 && pdb.insertCount > pdb.compactLimit { log.Debugf("Running pebble compact %d > %d", pdb.insertCount, pdb.compactLimit) - //pdb.db.Compact(ptx.lowest, ptx.highest, true) - pdb.db.Compact([]byte{0x00}, []byte{0xFF}, true) + if ptx.lowest != nil && ptx.highest != nil { + _ = pdb.db.Compact(ptx.lowest, ptx.highest, true) + } pdb.insertCount = 0 } return err diff --git a/server/api.go b/server/api.go index 4d9b38b3..3d78b784 100644 --- a/server/api.go +++ b/server/api.go @@ -8,6 +8,7 @@ import ( "strings" "sync" "sync/atomic" + "time" "github.com/bmeg/grip/engine/pipeline" "github.com/bmeg/grip/gdbi" @@ -25,28 +26,68 @@ import ( "google.golang.org/protobuf/encoding/protojson" ) +// maxEdgeLabelLen defines an upper bound on edge label length accepted by the server. +// This prevents pathological inputs from causing excessively large allocations downstream. +const maxEdgeLabelLen = 4096 + // Traversal parses a traversal request and streams the results back func (server *GripServer) Traversal(query *gripql.GraphQuery, queryServer gripql.Query_TraversalServer) error { + start := time.Now() gdb, err := server.getGraphDB(query.Graph) if err != nil { return err } + graphLookupElapsed := time.Since(start) graph, err := gdb.Graph(query.Graph) if err != nil { return err } + graphOpenElapsed := time.Since(start) - graphLookupElapsed compiler := graph.Compiler() + compileStart := time.Now() compiledPipeline, err := compiler.Compile(query.Query, nil) if err != nil { return err } + compileElapsed := time.Since(compileStart) + runStart := time.Now() res := pipeline.Run(queryServer.Context(), compiledPipeline, server.conf.Server.WorkDir) err = nil + var rowsSent int + sendStart := time.Now() for row := range res { if err == nil { err = queryServer.Send(row) + if err == nil { + rowsSent++ + } } } + runElapsed := time.Since(runStart) + sendElapsed := time.Since(sendStart) + totalElapsed := time.Since(start) + if rowsSent > 0 { + rps := float64(rowsSent) / sendElapsed.Seconds() + log.Debugf("Traversal summary graph=%s rows=%d rps=%.0f lookup=%s graphOpen=%s compile=%s run=%s send=%s total=%s", + query.Graph, rowsSent, rps, + graphLookupElapsed.Round(time.Millisecond), + graphOpenElapsed.Round(time.Millisecond), + compileElapsed.Round(time.Millisecond), + runElapsed.Round(time.Millisecond), + sendElapsed.Round(time.Millisecond), + totalElapsed.Round(time.Millisecond), + ) + } else { + log.Debugf("Traversal summary graph=%s rows=0 lookup=%s graphOpen=%s compile=%s run=%s send=%s total=%s", + query.Graph, + graphLookupElapsed.Round(time.Millisecond), + graphOpenElapsed.Round(time.Millisecond), + compileElapsed.Round(time.Millisecond), + runElapsed.Round(time.Millisecond), + sendElapsed.Round(time.Millisecond), + totalElapsed.Round(time.Millisecond), + ) + } if err != nil { return fmt.Errorf("error sending Traversal result: %v", err) } @@ -88,7 +129,7 @@ func (server *GripServer) GetVertex(ctx context.Context, elem *gripql.ElementID) } o := graph.GetVertex(elem.Id, true) if o == nil { - return nil, status.Errorf(codes.NotFound, fmt.Sprintf("vertex %s not found", elem.Id)) + return nil, status.Errorf(codes.NotFound, "vertex %s not found", elem.Id) } return o.ToVertex(), nil } @@ -105,7 +146,7 @@ func (server *GripServer) GetEdge(ctx context.Context, elem *gripql.ElementID) ( } o := graph.GetEdge(elem.Id, true) if o == nil { - return nil, status.Errorf(codes.NotFound, fmt.Sprintf("edge %s not found", elem.Id)) + return nil, status.Errorf(codes.NotFound, "edge %s not found", elem.Id) } return o.ToEdge(), nil } @@ -212,8 +253,12 @@ func (server *GripServer) addEdge(ctx context.Context, elem *gripql.GraphElement } edge := elem.Edge + // Enforce a maximum label length to avoid excessively large allocations in downstream key/index code. + if len(edge.Label) > maxEdgeLabelLen { + return nil, fmt.Errorf("edge label too long; maximum allowed length is %d bytes", maxEdgeLabelLen) + } if edge.Id == "" { - edge.Id = util.UUID() + edge.Id = util.DeterministicEdgeID(edge.From, edge.To, edge.Label, edge.Data.AsMap()) } err = edge.Validate() if err != nil { @@ -229,15 +274,28 @@ func (server *GripServer) addEdge(ctx context.Context, elem *gripql.GraphElement func (server *GripServer) BulkAddRaw(stream gripql.Edit_BulkAddRawServer) error { ctx := stream.Context() - inputCh := make(chan *gripql.RawJson, 100) - elementCh := make(chan *gdbi.GraphElement, 1000) - errCh := make(chan error, 100) + runCtx, cancel := context.WithCancel(ctx) + defer cancel() + + inputCh := make(chan *gripql.RawJson, 256) + elementCh := make(chan *gdbi.GraphElement, 2048) + errCh := make(chan error, 1024) var insertCount int32 var once sync.Once var schema *graph.GraphSchema var schemaErr error var wg sync.WaitGroup var producerWG sync.WaitGroup + pushErr := func(err error) { + if err == nil { + return + } + select { + case errCh <- err: + default: + log.WithFields(log.Fields{"error": err}).Error("BulkAddRaw: dropped error due full error channel") + } + } // Receive first class firstClass, err := stream.Recv() @@ -275,27 +333,33 @@ func (server *GripServer) BulkAddRaw(stream gripql.Edit_BulkAddRawServer) error defer wg.Done() if err := gdbiGraph.BulkAdd(elementCh); err != nil { log.WithFields(log.Fields{"graph": graphName, "error": err}).Error("BulkAddRaw: bulk add error") - errCh <- fmt.Errorf("bulk add failed: %w", err) + pushErr(fmt.Errorf("bulk add failed: %w", err)) + cancel() } }() // Start worker goroutines - for range runtime.NumCPU() { + workerCount := runtime.NumCPU() + if workerCount < 1 { + workerCount = 1 + } + for i := 0; i < workerCount; i++ { producerWG.Add(1) go func() { defer producerWG.Done() for class := range inputCh { select { - case <-ctx.Done(): - errCh <- ctx.Err() + case <-runCtx.Done(): + pushErr(runCtx.Err()) return default: } once.Do(loadSchema) if schemaErr != nil { - errCh <- schemaErr - continue + pushErr(schemaErr) + cancel() + return } classData := class.Data.AsMap() @@ -303,20 +367,21 @@ func (server *GripServer) BulkAddRaw(stream gripql.Edit_BulkAddRawServer) error if !ok { err := fmt.Errorf("row %v does not have required field resourceType", classData) log.WithFields(log.Fields{"error": err}).Error("BulkAddRaw: streaming error") - errCh <- err + pushErr(err) continue } result, err := schema.Generate(resourceType, classData, class.ExtraArgs.AsMap()) if err != nil { log.WithFields(log.Fields{"error": err}).Errorf("BulkAddRaw: validation error for %s: %v", resourceType, classData) - errCh <- fmt.Errorf("validation failed for %s: %w", resourceType, err) + pushErr(fmt.Errorf("validation failed for %s: %w", resourceType, err)) continue } for _, element := range result { + var graphElement *gdbi.GraphElement if element.Vertex != nil { - elementCh <- &gdbi.GraphElement{ + graphElement = &gdbi.GraphElement{ Vertex: &gdbi.Vertex{ ID: element.Vertex.Id, Data: element.Vertex.Data.AsMap(), @@ -325,9 +390,13 @@ func (server *GripServer) BulkAddRaw(stream gripql.Edit_BulkAddRawServer) error Graph: graphName, } } else if element.Edge != nil { - elementCh <- &gdbi.GraphElement{ + edgeID := element.Edge.Id + if edgeID == "" { + edgeID = util.DeterministicEdgeID(element.Edge.From, element.Edge.To, element.Edge.Label, element.Edge.Data.AsMap()) + } + graphElement = &gdbi.GraphElement{ Edge: &gdbi.Edge{ - ID: element.Edge.Id, + ID: edgeID, Label: element.Edge.Label, From: element.Edge.From, To: element.Edge.To, @@ -336,46 +405,59 @@ func (server *GripServer) BulkAddRaw(stream gripql.Edit_BulkAddRawServer) error Graph: graphName, } } + if graphElement != nil { + select { + case <-runCtx.Done(): + return + case elementCh <- graphElement: + } + } atomic.AddInt32(&insertCount, 1) } } }() } + // Collect errors + var retErrs []string + doneCollecting := make(chan struct{}) + go func() { + defer close(doneCollecting) + for err := range errCh { + retErrs = append(retErrs, err.Error()) + } + }() + // Receiver goroutine - inputCh <- firstClass producerWG.Add(1) go func() { defer producerWG.Done() defer close(inputCh) + + select { + case <-runCtx.Done(): + return + case inputCh <- firstClass: + } + for { class, err := stream.Recv() if err == io.EOF { - break + return } if err != nil { - errCh <- fmt.Errorf("receive failed: %w", err) - break + pushErr(fmt.Errorf("receive failed: %w", err)) + cancel() + return } select { - case <-ctx.Done(): - errCh <- ctx.Err() + case <-runCtx.Done(): return case inputCh <- class: } } }() - // Collect errors - var retErrs []string - doneCollecting := make(chan struct{}) - go func() { - defer close(doneCollecting) - for err := range errCh { - retErrs = append(retErrs, err.Error()) - } - }() - // Wait for completion producerWG.Wait() close(elementCh) @@ -415,11 +497,12 @@ func (server *GripServer) BulkAdd(stream gripql.Edit_BulkAddServer) error { return newStream } +Loop: for { // Check if context is done (client cancellation or goroutine error) select { case <-opCtx.Done(): - break + break Loop default: // Continue processing } @@ -481,7 +564,7 @@ func (server *GripServer) BulkAdd(stream gripql.Edit_BulkAddServer) error { } if element.Edge != nil { if element.Edge.Id == "" { - element.Edge.Id = util.UUID() + element.Edge.Id = util.DeterministicEdgeID(element.Edge.From, element.Edge.To, element.Edge.Label, element.Edge.Data.AsMap()) } if err := element.Edge.Validate(); err != nil { log.WithFields(log.Fields{"graph": element.Graph, "error": err}).Errorf("BulkAdd: edge validation failed for edge: %#v", element.Edge) @@ -649,14 +732,14 @@ func (server *GripServer) ListLabels(ctx context.Context, idx *gripql.GraphID) ( // GetSchema returns the schema of a specific graph in the database func (server *GripServer) GetSchema(ctx context.Context, elem *gripql.GraphID) (*gripql.Graph, error) { if !server.graphExists(elem.Graph) { - return nil, status.Errorf(codes.NotFound, fmt.Sprintf("graph %s: not found", elem.Graph)) + return nil, status.Errorf(codes.NotFound, "graph %s: not found", elem.Graph) } schema, ok := server.schemas[elem.Graph] if !ok { if server.conf.Server.AutoBuildSchemas { - return nil, status.Errorf(codes.Unavailable, fmt.Sprintf("graph %s: schema not available; try again later", elem.Graph)) + return nil, status.Errorf(codes.Unavailable, "graph %s: schema not available; try again later", elem.Graph) } - return nil, status.Errorf(codes.NotFound, fmt.Sprintf("graph %s: schema not found", elem.Graph)) + return nil, status.Errorf(codes.NotFound, "graph %s: schema not found", elem.Graph) } if schema.Graph == "" { schema.Graph = elem.Graph @@ -667,7 +750,7 @@ func (server *GripServer) GetSchema(ctx context.Context, elem *gripql.GraphID) ( // GetSchema returns the schema of a specific graph in the database func (server *GripServer) SampleSchema(ctx context.Context, elem *gripql.GraphID) (*gripql.Graph, error) { if !server.graphExists(elem.Graph) { - return nil, status.Errorf(codes.NotFound, fmt.Sprintf("graph %s: not found", elem.Graph)) + return nil, status.Errorf(codes.NotFound, "graph %s: not found", elem.Graph) } if gdb, err := server.getGraphDB(elem.Graph); err == nil { schema, err := gdb.BuildSchema(ctx, elem.Graph, 50, true) @@ -719,7 +802,7 @@ func (server *GripServer) AddJsonSchema(ctx context.Context, rawjson *gripql.Raw // GetMapping returns the schema of a specific graph in the database func (server *GripServer) GetMapping(ctx context.Context, elem *gripql.GraphID) (*gripql.Graph, error) { if !server.graphExists(elem.Graph) { - return nil, status.Errorf(codes.NotFound, fmt.Sprintf("graph %s: not found", elem.Graph)) + return nil, status.Errorf(codes.NotFound, "graph %s: not found", elem.Graph) } mapping, err := server.getGraph(elem.Graph + mappingSuffix) if err != nil { diff --git a/server/api_fast.go b/server/api_fast.go new file mode 100644 index 00000000..7189f6b6 --- /dev/null +++ b/server/api_fast.go @@ -0,0 +1,181 @@ +package server + +import ( + "io" + "net/http" + "strings" + "time" + + "github.com/bmeg/grip/accounts" + "github.com/bmeg/grip/engine" + "github.com/bmeg/grip/engine/pipeline" + "github.com/bmeg/grip/gdbi" + "github.com/bmeg/grip/gripql" + "github.com/bmeg/grip/log" + "github.com/bytedance/sonic" + "google.golang.org/protobuf/encoding/protojson" +) + +// writeLineFast writes a line to the HTTP ResponseWriter using sonic +func writeLineFast(resp http.ResponseWriter, out map[string]any) error { + b, err := sonic.ConfigFastest.Marshal(out) + if err != nil { + return err + } + _, err = resp.Write(b) + if err != nil { + return err + } + _, err = resp.Write([]byte("\n")) + return err +} + +func (server *GripServer) fastQueryHandler(resp http.ResponseWriter, req *http.Request, graphName string) { + // Authentication + md := accounts.MetaData{} + for k, v := range req.Header { + md[strings.ToLower(k)] = v + } + + // Check auth + auth := server.conf.Server.Accounts.GetAuth() + access := server.conf.Server.Accounts.GetAccess() + + user, err := auth.Validate(md) + if err != nil { + http.Error(resp, "PermissionDenied", http.StatusUnauthorized) + return + } + err = access.Enforce(user, graphName, accounts.Query) + if err != nil { + http.Error(resp, "PermissionDenied", http.StatusUnauthorized) + return + } + + body, err := io.ReadAll(io.LimitReader(req.Body, 32*1024*1024)) + if err != nil { + http.Error(resp, err.Error(), http.StatusBadRequest) + return + } + + query := gripql.GraphQuery{} + err = protojson.Unmarshal(body, &query) + if err != nil { + http.Error(resp, err.Error(), http.StatusBadRequest) + return + } + + gdb, err := server.getGraphDB(graphName) + if err != nil { + http.Error(resp, err.Error(), http.StatusBadRequest) + return + } + graph, err := gdb.Graph(graphName) + if err != nil { + http.Error(resp, err.Error(), http.StatusBadRequest) + return + } + + compiler := graph.Compiler() + compiledPipeline, err := compiler.Compile(query.Query, nil) + if err != nil { + http.Error(resp, err.Error(), http.StatusBadRequest) + return + } + + resp.Header().Set("Content-Type", "application/x-ndjson") + resp.WriteHeader(http.StatusOK) + + ctx := req.Context() + man := engine.NewManager(server.conf.Server.WorkDir) + defer man.Cleanup() + + // 20k buffer size used internally by engine + pipe := pipeline.Start(ctx, compiledPipeline, man, 20000, nil, nil) + if pipe == nil { + return + } + + dataType := compiledPipeline.DataType() + + start := time.Now() + var rowsSent int + + for t := range pipe.Outputs { + if t.IsSignal() { + continue + } + var err error + switch dataType { + case gdbi.VertexData: + cur := t.GetCurrent() + if cur != nil { + v := cur.Get() + if v != nil { + if !v.Loaded { + v = graph.GetVertex(v.ID, true) + } + if v != nil { + err = writeLineFast(resp, map[string]any{"vertex": v.ToDict()}) + } + } + } + case gdbi.EdgeData: + cur := t.GetCurrent() + if cur != nil { + e := cur.Get() + if e != nil { + if !e.Loaded { + e = graph.GetEdge(e.ID, true) + } + if e != nil { + err = writeLineFast(resp, map[string]any{"edge": e.ToDict()}) + } + } + } + case gdbi.CountData: + err = writeLineFast(resp, map[string]any{"count": t.GetCount()}) + case gdbi.AggregationData: + agg := t.GetAggregation() + aggMap := map[string]any{ + "name": agg.Name, + "key": agg.Key, + "value": agg.Value, + } + err = writeLineFast(resp, map[string]any{"aggregations": aggMap}) + case gdbi.RenderData: + err = writeLineFast(resp, map[string]any{"render": t.GetRender()}) + case gdbi.PathData: + path := t.GetPath() + o := make([]any, len(path)) + for i := range path { + j := map[string]any{} + if path[i].Vertex != "" { + j["vertex"] = path[i].Vertex + } else if path[i].Edge != "" { + j["edge"] = path[i].Edge + } + o[i] = j + } + err = writeLineFast(resp, map[string]any{"path": o}) + default: + // Just use the normal protobuf batch convert if we don't know what it is + } + + if err != nil { + log.Errorf("fastQueryHandler: %v", err) + break + } + rowsSent++ + } + + elapsed := time.Since(start) + if rowsSent > 0 { + rps := float64(rowsSent) / elapsed.Seconds() + log.Debugf("TraversalFast summary graph=%s rows=%d rps=%.0f total=%s", + graphName, rowsSent, rps, elapsed.Round(time.Millisecond)) + } else { + log.Debugf("TraversalFast summary graph=%s rows=0 total=%s", + graphName, elapsed.Round(time.Millisecond)) + } +} diff --git a/server/logging_middleware.go b/server/logging_middleware.go index a46ae664..06ca4306 100644 --- a/server/logging_middleware.go +++ b/server/logging_middleware.go @@ -19,7 +19,7 @@ func extractHeaderKeys(input map[string][]string, whitelist []string) map[string } for k, v := range input { for _, w := range whitelist { - if strings.ToLower(k) == strings.ToLower(w) { + if strings.EqualFold(k, w) { filtered[strings.ToLower(k)] = v break } diff --git a/server/marshaler.go b/server/marshaler.go index a25c89f9..d864fe76 100644 --- a/server/marshaler.go +++ b/server/marshaler.go @@ -6,7 +6,6 @@ import ( "github.com/bmeg/grip/gripql" "github.com/grpc-ecosystem/grpc-gateway/v2/runtime" "golang.org/x/net/context" - "google.golang.org/protobuf/encoding/protojson" "google.golang.org/protobuf/proto" ) @@ -20,13 +19,7 @@ type MarshalClean struct { func NewMarshaler() runtime.Marshaler { return &MarshalClean{ - m: &runtime.JSONPb{ - MarshalOptions: protojson.MarshalOptions{EmitUnpopulated: true}, - UnmarshalOptions: protojson.UnmarshalOptions{}, - //EnumsAsInts: false, - //EmitDefaults: true, - //OrigName: true, - }, + m: gripql.NewFlattenMarshaler(), } } diff --git a/server/server.go b/server/server.go index d35283ff..6cdfd608 100644 --- a/server/server.go +++ b/server/server.go @@ -3,11 +3,13 @@ package server import ( "bytes" + "errors" "fmt" "io" "maps" "net" "net/http" + "net/http/pprof" "os" "path/filepath" "strings" @@ -252,6 +254,11 @@ func (server *GripServer) Serve(pctx context.Context) error { ) mux := http.NewServeMux() + mux.HandleFunc("/debug/pprof/", pprof.Index) + mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) + mux.HandleFunc("/debug/pprof/profile", pprof.Profile) + mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol) + mux.HandleFunc("/debug/pprof/trace", pprof.Trace) // Setup GraphQL handler /* user := "" @@ -352,7 +359,7 @@ func (server *GripServer) Serve(pctx context.Context) error { // copy body and return it to request var body []byte if server.conf.Server.RequestLogging.Enable || server.kafkaProducer != nil { - body, _ = io.ReadAll(req.Body) + body, _ = io.ReadAll(io.LimitReader(req.Body, 32*1024*1024)) req.Body = io.NopCloser(bytes.NewBuffer(body)) if server.kafkaProducer != nil { // This should cover BulkAdd, Addvertex, Addedge, BulkDelete, DeleteVertex, DeleteEdge @@ -366,7 +373,7 @@ func (server *GripServer) Serve(pctx context.Context) error { } partition, offset, err := server.kafkaProducer.SendMessage(msg) if err != nil { - log.Errorf("Failed to send Kafka message to topic %#v: %v", *&server.conf.Kafka.Topic, err) + log.Errorf("Failed to send Kafka message to topic %#v: %v", *server.conf.Kafka.Topic, err) } else { log.Infof("Message sent to Kafka topic %s [partition %d, offset %d]", *server.conf.Kafka.Topic, partition, offset) } @@ -418,6 +425,15 @@ func (server *GripServer) Serve(pctx context.Context) error { return fmt.Errorf("registering query endpoint: %v", err) } + // Override standard generated Query_Traversal grpc-gateway endpoint with fast execution path + err = grpcMux.HandlePath("POST", "/v1/graph/{graph}/query", func(w http.ResponseWriter, req *http.Request, pathParams map[string]string) { + graphName := pathParams["graph"] + server.fastQueryHandler(w, req, graphName) + }) + if err != nil { + return fmt.Errorf("registering fast query endpoint: %v", err) + } + // Regsiter Edit Service if !server.conf.Server.ReadOnly { gripql.RegisterEditServer(grpcServer, server) @@ -523,9 +539,23 @@ func (server *GripServer) Serve(pctx context.Context) error { <-ctx.Done() //This will hold until canceled, usually from kill signal log.Infoln("shutting down RPC server...") - grpcServer.GracefulStop() + shutdownTimeout := 30 * time.Second + grpcDone := make(chan struct{}) + go func() { + grpcServer.GracefulStop() + close(grpcDone) + }() + select { + case <-grpcDone: + log.Infoln("RPC server gracefully stopped") + case <-time.After(shutdownTimeout): + log.Warningf("RPC graceful stop exceeded %s; forcing stop", shutdownTimeout) + grpcServer.Stop() + } log.Infoln("shutting down HTTP proxy...") - err = httpServer.Shutdown(context.TODO()) + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), shutdownTimeout) + err = httpServer.Shutdown(shutdownCtx) + shutdownCancel() if err != nil { log.Errorf("shutdown error: %v", err) } @@ -549,8 +579,20 @@ func (server *GripServer) Serve(pctx context.Context) error { server.ClosePlugins() - if grpcErr != nil || httpErr != nil { - return fmt.Errorf("gRPC Server Error: %v\nHTTP Server Error: %v", grpcErr, httpErr) + if grpcErr == grpc.ErrServerStopped || errors.Is(grpcErr, net.ErrClosed) || strings.Contains(fmt.Sprint(grpcErr), "use of closed network connection") { + grpcErr = nil + } + if errors.Is(httpErr, http.ErrServerClosed) { + httpErr = nil + } + if grpcErr != nil && httpErr != nil { + return fmt.Errorf("gRPC Server Error: %v; HTTP Server Error: %v", grpcErr, httpErr) + } + if grpcErr != nil { + return fmt.Errorf("gRPC Server Error: %w", grpcErr) + } + if httpErr != nil { + return fmt.Errorf("HTTP Server Error: %w", httpErr) } return nil } diff --git a/test/grids.yml b/test/grids.yml index a3ff66ed..74f2f501 100644 --- a/test/grids.yml +++ b/test/grids.yml @@ -2,4 +2,6 @@ Default: grids Drivers: grids: - Grids: grip-grids.db + Grids: + GraphDir: grip-grids.db + Driver: jsontable diff --git a/test/kafka.yml b/test/kafka.yml index 5e3583f4..081d6370 100644 --- a/test/kafka.yml +++ b/test/kafka.yml @@ -2,7 +2,8 @@ Default: grids Drivers: grids: - Grids: grip-grids.db + Grids: + GraphDir: grip-grids.db Kafka: Username: admin diff --git a/test/main_test.go b/test/main_test.go index a725f5fb..a63b1af6 100644 --- a/test/main_test.go +++ b/test/main_test.go @@ -153,7 +153,7 @@ func TestMain(m *testing.M) { } else if dbconfig.Grids != nil { gdb, err = grids.NewGraphDB(*dbconfig.Grids) defer func() { - os.RemoveAll(*dbconfig.Grids) + os.RemoveAll(dbconfig.Grids.GraphDir) }() if err != nil { fmt.Printf("Init error: %s\n", err) diff --git a/test/server/auth_test.go b/test/server/auth_test.go index 549de6f0..902d12ef 100644 --- a/test/server/auth_test.go +++ b/test/server/auth_test.go @@ -251,7 +251,11 @@ func TestCasbinAccess(t *testing.T) { fmt.Printf("Doing http traversal\n") resp, err = httpQuery(conf.Server.HTTPPort, "test1", "bob", "1234", q) if err != nil || resp.StatusCode != 200 { - t.Errorf("unexpected error: %v, status: %d", err, resp.StatusCode) + bodyBytes, err := io.ReadAll(resp.Body) + if err != nil { + t.Errorf("unexpected error: %v, status: %d", err, resp.StatusCode) + } + t.Errorf("unexpected error: %v, status: %d, body: %s", err, resp.StatusCode, string(bodyBytes)) } /* diff --git a/util/insert.go b/util/insert.go index b923b8a9..987f3ba0 100644 --- a/util/insert.go +++ b/util/insert.go @@ -75,7 +75,7 @@ func StreamBatch(stream <-chan *gdbi.GraphElement, batchSize int, graph string, } else if element.Edge != nil { edge := element.Edge if edge.ID == "" { - edge.ID = UUID() + edge.ID = DeterministicEdgeID(edge.From, edge.To, edge.Label, edge.Data) } if err := edge.Validate(); err != nil { diff --git a/util/random.go b/util/random.go index 44ed5b52..a9d0b8ee 100644 --- a/util/random.go +++ b/util/random.go @@ -1,6 +1,8 @@ package util import ( + "crypto/sha1" + "encoding/json" "fmt" "math/rand" "time" @@ -24,6 +26,12 @@ func UUID() string { return ksuid.New().String() } +// DeterministicEdgeID generates a stable ID for an edge based on its source, destination, label, and data. +func DeterministicEdgeID(src, dst, label string, data map[string]interface{}) string { + b, _ := json.Marshal(data) + return fmt.Sprintf("%x", sha1.Sum([]byte(fmt.Sprintf("%s:%s:%s:%s", src, dst, label, string(b))))) +} + // RandomPort returns a random port string between 10000 and 20000. func RandomPort() string { min := 10000