
Commit 2b9b799

address the memory time bomb

1 parent d0de500
4 files changed: +104 −51 lines changed

taco/internal/tfe/apply.go

Lines changed: 27 additions & 7 deletions
@@ -5,6 +5,7 @@ import (
 	"net/http"
 	"os"
 	"strconv"
+	"strings"
 	"time"
 
 	"github.com/diggerhq/digger/opentaco/internal/auth"
@@ -100,15 +101,34 @@ func (h *TfeHandler) GetApplyLogs(c echo.Context) error {
 		return c.JSON(http.StatusNotFound, map[string]string{"error": "apply not found"})
 	}
 
-	// Try to get apply logs from blob storage
+	// Read apply logs from chunked S3 objects
+	// Chunks are stored as applies/{applyID}/chunks/00000001.log, 00000002.log, etc.
 	var logText string
-	applyLogBlobID := fmt.Sprintf("runs/%s/apply-logs.txt", run.ID)
+	chunkIndex := 1
+	var fullLogs strings.Builder
 
-	logData, err := h.blobStore.DownloadBlob(ctx, applyLogBlobID)
-	if err == nil {
-		logText = string(logData)
-	} else {
-		// If logs don't exist yet, return placeholder
+	for {
+		chunkKey := fmt.Sprintf("applies/%s/chunks/%08d.log", run.ID, chunkIndex)
+		logData, err := h.blobStore.DownloadBlob(ctx, chunkKey)
+
+		if err != nil {
+			// Chunk doesn't exist - check if apply is still running
+			if run.Status == "applied" || run.Status == "errored" {
+				// Apply is done, no more chunks coming
+				break
+			}
+			// Apply still running, this chunk doesn't exist yet
+			break
+		}
+
+		fullLogs.Write(logData)
+		chunkIndex++
+	}
+
+	logText = fullLogs.String()
+
+	// If no chunks exist yet, generate default message based on status
+	if logText == "" {
 		if run.Status == "applying" || run.Status == "apply_queued" {
 			logText = "Waiting for apply to start...\n"
 		} else {
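
The chunk keys use a zero-padded index (%08d) so that lexicographic key ordering in S3 matches numeric chunk order, which is what lets the reader walk 00000001.log, 00000002.log, … in sequence. A small standalone sketch of that property (illustrative only, not code from the commit):

package main

import (
	"fmt"
	"sort"
)

func main() {
	// Zero-padded keys sort lexicographically in numeric order;
	// unpadded keys would put "10" before "2".
	keys := []string{}
	for _, i := range []int{3, 10, 1, 2} {
		keys = append(keys, fmt.Sprintf("applies/run-abc/chunks/%08d.log", i))
	}
	sort.Strings(keys)
	for _, k := range keys {
		fmt.Println(k)
	}
	// Prints ...00000001.log, 00000002.log, 00000003.log, 00000010.log
}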

taco/internal/tfe/apply_executor.go

Lines changed: 24 additions & 10 deletions
@@ -147,23 +147,37 @@ func (e *ApplyExecutor) ExecuteApply(ctx context.Context, runID string) error {
 		}
 	}()
 
-	// Buffered logging to reduce blob storage roundtrips
-	applyLogBlobID := fmt.Sprintf("runs/%s/apply-logs.txt", run.ID)
+	// Chunked logging to prevent memory bloat
+	// Upload log chunks as separate S3 objects and clear buffer after each upload
+	chunkIndex := 1
 	var logBuffer bytes.Buffer
 	var logMutex sync.Mutex
 	lastLogFlush := time.Now()
-	lastFlushSize := 0
 
+	// Flush helper - uploads current buffer as a chunk and clears it
 	flushLogs := func() error {
 		logMutex.Lock()
-		defer logMutex.Unlock()
 		if logBuffer.Len() == 0 {
+			logMutex.Unlock()
 			return nil
 		}
-		err := e.blobStore.UploadBlob(ctx, applyLogBlobID, logBuffer.Bytes())
+		// Copy buffer to avoid holding lock during upload
+		data := make([]byte, logBuffer.Len())
+		copy(data, logBuffer.Bytes())
+		currentChunk := chunkIndex
+		logMutex.Unlock()
+
+		// Upload this chunk (key includes zero-padded chunk index)
+		chunkKey := fmt.Sprintf("applies/%s/chunks/%08d.log", run.ID, currentChunk)
+		err := e.blobStore.UploadBlob(ctx, chunkKey, data)
+
 		if err == nil {
+			logMutex.Lock()
 			lastLogFlush = time.Now()
-			lastFlushSize = logBuffer.Len()
+			// Clear buffer to free memory
+			logBuffer.Reset()
+			chunkIndex++
+			logMutex.Unlock()
 		}
 		return err
 	}
@@ -172,8 +186,8 @@ func (e *ApplyExecutor) ExecuteApply(ctx context.Context, runID string) error {
 		logMutex.Lock()
 		logBuffer.WriteString(message)
 		now := time.Now()
-		// Flush if we have >1KB of NEW data or if 1s has passed
-		shouldFlush := (logBuffer.Len()-lastFlushSize) > 1024 || now.Sub(lastLogFlush) > 1*time.Second
+		// Flush if buffer exceeds chunk size (256KB) or 1s has passed
+		shouldFlush := logBuffer.Len() > 256*1024 || now.Sub(lastLogFlush) > 1*time.Second
 		logMutex.Unlock()
 
 		if shouldFlush {
@@ -334,7 +348,7 @@ func (e *ApplyExecutor) ExecuteApply(ctx context.Context, runID string) error {
 	if applyErr != nil {
 		runStatus = "errored"
 		logs = logs + "\n\nError: " + applyErr.Error()
-		_ = e.blobStore.UploadBlob(ctx, applyLogBlobID, []byte(logs))
+		// Error already logged via appendLog in the executor
 		if updateErr := e.runRepo.UpdateRunError(ctx, run.ID, applyErr.Error()); updateErr != nil {
 			logger.Error("failed to update run error", slog.String("error", updateErr.Error()))
 		}
@@ -351,7 +365,7 @@ func (e *ApplyExecutor) ExecuteApply(ctx context.Context, runID string) error {
 		runStatus = "errored"
 		errMsg := fmt.Sprintf("Failed to upload state: %v", uploadErr)
 		logs = logs + "\n\nCritical Error: " + errMsg + "\n"
-		_ = e.blobStore.UploadBlob(ctx, applyLogBlobID, []byte(logs))
+		// Error already logged via appendLog in the executor
 		if updateErr := e.runRepo.UpdateRunError(ctx, run.ID, errMsg); updateErr != nil {
 			logger.Error("failed to update run error", slog.String("error", updateErr.Error()))
 		}
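
The flush helper snapshots the buffer under the lock and uploads outside it, so a slow S3 call never blocks writers. A minimal standalone sketch of that pattern (the chunkFlusher type and names are illustrative, not from this codebase; it assumes a single flushing goroutine); this variant discards only the bytes that were actually uploaded, so appends that race with the upload survive:

package main

import (
	"bytes"
	"fmt"
	"sync"
)

type chunkFlusher struct {
	mu     sync.Mutex
	buf    bytes.Buffer
	index  int // next chunk number, 1-based
	upload func(key string, data []byte) error
}

func (f *chunkFlusher) append(s string) {
	f.mu.Lock()
	f.buf.WriteString(s)
	f.mu.Unlock()
}

// flush snapshots the buffer under the lock, uploads outside it, and on
// success drops only the uploaded prefix so concurrent appends are kept.
func (f *chunkFlusher) flush(prefix string) error {
	f.mu.Lock()
	if f.buf.Len() == 0 {
		f.mu.Unlock()
		return nil
	}
	data := make([]byte, f.buf.Len())
	copy(data, f.buf.Bytes())
	chunk := f.index
	f.mu.Unlock()

	key := fmt.Sprintf("%s/chunks/%08d.log", prefix, chunk)
	if err := f.upload(key, data); err != nil {
		return err // keep buffered data; the next flush retries this chunk
	}

	f.mu.Lock()
	f.buf.Next(len(data)) // discard only what was uploaded
	f.index++
	f.mu.Unlock()
	return nil
}

func main() {
	f := &chunkFlusher{
		index: 1,
		upload: func(key string, data []byte) error {
			fmt.Printf("uploaded %s (%d bytes)\n", key, len(data))
			return nil
		},
	}
	f.append("terraform apply output...\n")
	_ = f.flush("applies/run-123")
}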

taco/internal/tfe/plan.go

Lines changed: 26 additions & 11 deletions
@@ -6,6 +6,7 @@ import (
 	"net/http"
 	"os"
 	"strconv"
+	"strings"
 	"time"
 
 	"github.com/diggerhq/digger/opentaco/internal/auth"
@@ -104,20 +105,34 @@ func (h *TfeHandler) GetPlanLogs(c echo.Context) error {
 		return c.JSON(http.StatusNotFound, map[string]string{"error": "plan not found"})
 	}
 
-	// Check if logs exist in blob storage
+	// Read logs from chunked S3 objects
+	// Chunks are stored as plans/{planID}/chunks/00000001.log, 00000002.log, etc.
 	var logText string
-	if plan.LogBlobID != nil {
-		// Try to get logs from blob storage
-		logData, err := h.blobStore.DownloadBlob(ctx, *plan.LogBlobID)
+	chunkIndex := 1
+	var fullLogs strings.Builder
+
+	for {
+		chunkKey := fmt.Sprintf("plans/%s/chunks/%08d.log", planID, chunkIndex)
+		logData, err := h.blobStore.DownloadBlob(ctx, chunkKey)
+
 		if err != nil {
-			fmt.Printf("Failed to get logs from blob storage: %v\n", err)
-			// Fall back to default logs
-			logText = generateDefaultPlanLogs(plan)
-		} else {
-			logText = string(logData)
+			// Chunk doesn't exist - check if plan is still running
+			if plan.Status == "finished" || plan.Status == "errored" {
+				// Plan is done, no more chunks coming
+				break
+			}
+			// Plan still running, this chunk doesn't exist yet
+			break
 		}
-	} else {
-		// Generate default logs based on plan status
+
+		fullLogs.Write(logData)
+		chunkIndex++
+	}
+
+	logText = fullLogs.String()
+
+	// If no chunks exist yet, generate default logs based on status
+	if logText == "" {
		logText = generateDefaultPlanLogs(plan)
	}
 
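
Both handlers now repeat the same read loop, which factors naturally into a small helper. A sketch under the assumption of a DownloadBlob-style store (the BlobStore interface and readChunks name are hypothetical, for illustration):

package logchunks

import (
	"context"
	"fmt"
	"strings"
)

// BlobStore is the minimal surface the read loop needs (hypothetical).
type BlobStore interface {
	DownloadBlob(ctx context.Context, key string) ([]byte, error)
}

// readChunks concatenates sequentially numbered chunk objects until the
// first missing one; a download error is treated as "no more chunks yet",
// matching the handlers above.
func readChunks(ctx context.Context, store BlobStore, prefix string) string {
	var sb strings.Builder
	for i := 1; ; i++ {
		data, err := store.DownloadBlob(ctx, fmt.Sprintf("%s/chunks/%08d.log", prefix, i))
		if err != nil {
			break
		}
		sb.Write(data)
	}
	return sb.String()
}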

taco/internal/tfe/plan_executor.go

Lines changed: 27 additions & 23 deletions
@@ -185,37 +185,50 @@ func (e *PlanExecutor) ExecutePlan(ctx context.Context, runID string) error {
 		}
 	}()
 
-	// Buffered logging to reduce blob storage roundtrips
-	// Instead of download-append-upload on each message, we accumulate in memory
-	// and flush periodically (every 1KB or 5 seconds)
-	logBlobID := fmt.Sprintf("plans/%s/logs.txt", *run.PlanID)
+	// Chunked logging to prevent memory bloat
+	// Upload log chunks as separate S3 objects and clear buffer after each upload
+	// This keeps memory usage bounded regardless of total log size
+	chunkIndex := 1
 	var logBuffer bytes.Buffer
 	var logMutex sync.Mutex
 	lastLogFlush := time.Now()
-	lastFlushSize := 0
 
-	// Flush helper - uploads current buffer to blob storage
+	// Flush helper - uploads current buffer as a chunk and clears it
 	flushLogs := func() error {
 		logMutex.Lock()
-		defer logMutex.Unlock()
 		if logBuffer.Len() == 0 {
+			logMutex.Unlock()
 			return nil
 		}
-		err := e.blobStore.UploadBlob(ctx, logBlobID, logBuffer.Bytes())
+		// Copy buffer to avoid holding lock during upload
+		data := make([]byte, logBuffer.Len())
+		copy(data, logBuffer.Bytes())
+		currentChunk := chunkIndex
+		logMutex.Unlock()
+
+		// Upload this chunk (key includes zero-padded chunk index)
+		chunkKey := fmt.Sprintf("plans/%s/chunks/%08d.log", *run.PlanID, currentChunk)
+		err := e.blobStore.UploadBlob(ctx, chunkKey, data)
+
 		if err == nil {
+			logMutex.Lock()
 			lastLogFlush = time.Now()
-			lastFlushSize = logBuffer.Len()
+			// Clear buffer to free memory (this is the key fix for memory bloat!)
+			logBuffer.Reset()
+			chunkIndex++
+			logMutex.Unlock()
 		}
 		return err
 	}
 
+
 	// Buffered append - only uploads when buffer is large or time has elapsed
 	appendLog := func(message string) {
 		logMutex.Lock()
 		logBuffer.WriteString(message)
 		now := time.Now()
-		// Flush if we have >1KB of NEW data or if 1s has passed
-		shouldFlush := (logBuffer.Len()-lastFlushSize) > 1024 || now.Sub(lastLogFlush) > 1*time.Second
+		// Flush if buffer exceeds chunk size (256KB) or 1s has passed
+		shouldFlush := logBuffer.Len() > 256*1024 || now.Sub(lastLogFlush) > 1*time.Second
 		logMutex.Unlock()
 
 		if shouldFlush {
@@ -235,17 +248,8 @@ func (e *PlanExecutor) ExecutePlan(ctx context.Context, runID string) error {
 	}
 	logger.Info("updated run status to planning")
 
-	// Update plan with LogBlobID immediately so API can stream logs
-	// This restores the domain pattern where the DB is the source of truth
-	if run.PlanID != nil {
-		planUpdates := &domain.TFEPlanUpdate{
-			LogBlobID: &logBlobID,
-		}
-		if err := e.planRepo.UpdatePlan(ctx, *run.PlanID, planUpdates); err != nil {
-			logger.Warn("failed to update plan with log blob ID", slog.String("error", err.Error()))
-			// Non-fatal, continue
-		}
-	}
+	// Note: We no longer set LogBlobID since we use chunked logging
+	// The API reads chunks directly from plans/{planID}/chunks/*.log
 
 	appendLog("Preparing terraform run...\n")
 
@@ -473,7 +477,7 @@ func (e *PlanExecutor) ExecutePlan(ctx context.Context, runID string) error {
 		ResourceChanges:      &changes,
 		ResourceDestructions: &destroys,
 		HasChanges:           &hasChanges,
-		LogBlobID:            &logBlobID,
+		// LogBlobID removed - we use chunked logging now
 		LogReadURL:           &logReadURL,
 	}
 	if len(planJSON) > 0 {
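
The size-or-time trigger is what makes the memory bound hold: once the buffer crosses 256 KB a flush runs, and on success Reset() returns it to zero, so resident log memory stays near one chunk no matter how much output a run produces. A toy standalone simulation of just that bookkeeping (no S3; numbers are illustrative):

package main

import (
	"bytes"
	"fmt"
	"time"
)

const chunkSize = 256 * 1024

func main() {
	var buf bytes.Buffer
	lastFlush := time.Now()
	maxResident := 0
	chunks := 0

	// Simulate ~10 MB of log output arriving in 4 KB writes.
	line := bytes.Repeat([]byte("x"), 4096)
	for i := 0; i < 2560; i++ {
		buf.Write(line)
		if buf.Len() > chunkSize || time.Since(lastFlush) > time.Second {
			if buf.Len() > maxResident {
				maxResident = buf.Len()
			}
			buf.Reset() // the chunk upload would happen here; on success the buffer is cleared
			chunks++
			lastFlush = time.Now()
		}
	}
	fmt.Printf("wrote ~10 MB, peak buffer %d KB across %d chunks\n", maxResident/1024, chunks)
}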
