Skip to content

Commit 02587ae

Browse files
authored
Downgrade errors to warnings when some downloads fail but at least one succeeds, and improve error logging (#134)
1 parent 6dd7fce commit 02587ae

File tree

3 files changed

+124
-37
lines changed

3 files changed

+124
-37
lines changed

cmd/execute/execute.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,11 @@ func run(cfg *Config) error {
295295
if err != nil {
296296
return fmt.Errorf("failed to get run: %w", err)
297297
}
298-
actions.DownloadRunOutput(client, run, cfg.NodesToDownload, []string{}, cfg.OutputDirectory)
298+
results, err := actions.DownloadRunOutput(client, run, cfg.NodesToDownload, []string{}, cfg.OutputDirectory)
299+
if err != nil {
300+
return fmt.Errorf("failed to download run outputs: %w", err)
301+
}
302+
actions.PrintDownloadResults(results)
299303
}
300304

301305
return nil

cmd/output/output.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,11 @@ func run(cfg *Config) error {
8484
path := cfg.GetOutputPath()
8585

8686
for _, run := range runs {
87-
if err := actions.DownloadRunOutput(client, &run, nodes, files, path); err != nil {
87+
results, err := actions.DownloadRunOutput(client, &run, nodes, files, path)
88+
if err != nil {
8889
return fmt.Errorf("failed to download run output: %w", err)
8990
}
91+
actions.PrintDownloadResults(results)
9092
}
9193
return nil
9294
}

pkg/actions/output.go

Lines changed: 116 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,56 +2,105 @@ package actions
22

33
import (
44
"context"
5-
"errors"
65
"fmt"
6+
"os"
77
"sync"
88

99
"github.com/google/uuid"
1010
"github.com/trickest/trickest-cli/pkg/filesystem"
1111
"github.com/trickest/trickest-cli/pkg/trickest"
1212
)
1313

14-
func DownloadRunOutput(client *trickest.Client, run *trickest.Run, nodes []string, files []string, destinationPath string) error {
14+
// DownloadResult represents the result of downloading outputs for a single subjob
15+
type DownloadResult struct {
16+
SubJobName string
17+
FileName string
18+
Success bool
19+
Error error
20+
}
21+
22+
func PrintDownloadResults(results []DownloadResult) {
23+
successCount := 0
24+
failureCount := 0
25+
for _, result := range results {
26+
if result.Success {
27+
successCount++
28+
} else {
29+
failureCount++
30+
if result.FileName != "" {
31+
fmt.Fprintf(os.Stderr, "Warning: Failed to download file %q for node %q: %v\n", result.FileName, result.SubJobName, result.Error)
32+
} else {
33+
fmt.Fprintf(os.Stderr, "Warning: Failed to download output for node %q: %v\n", result.SubJobName, result.Error)
34+
}
35+
}
36+
}
37+
38+
if failureCount > 0 {
39+
fmt.Fprintf(os.Stderr, "Download completed with %d successful and %d failed downloads\n", successCount, failureCount)
40+
} else if successCount > 0 {
41+
fmt.Printf("Successfully downloaded outputs for %d sub-jobs\n", successCount)
42+
}
43+
}
44+
45+
func DownloadRunOutput(client *trickest.Client, run *trickest.Run, nodes []string, files []string, destinationPath string) ([]DownloadResult, error) {
1546
if run.Status == "PENDING" || run.Status == "SUBMITTED" {
16-
return fmt.Errorf("run %s has not started yet (status: %s)", run.ID.String(), run.Status)
47+
return nil, fmt.Errorf("run %s has not started yet (status: %s)", run.ID.String(), run.Status)
1748
}
1849

1950
ctx := context.Background()
2051

2152
subJobs, err := client.GetSubJobs(ctx, *run.ID)
2253
if err != nil {
23-
return fmt.Errorf("failed to get subjobs for run %s: %w", run.ID.String(), err)
54+
return nil, fmt.Errorf("failed to get subjobs for run %s: %w", run.ID.String(), err)
2455
}
2556

2657
version, err := client.GetWorkflowVersion(ctx, *run.WorkflowVersionInfo)
2758
if err != nil {
28-
return fmt.Errorf("could not get workflow version for run %s: %w", run.ID.String(), err)
59+
return nil, fmt.Errorf("could not get workflow version for run %s: %w", run.ID.String(), err)
2960
}
3061
subJobs = trickest.LabelSubJobs(subJobs, *version)
3162

3263
matchingSubJobs, err := trickest.FilterSubJobs(subJobs, nodes)
3364
if err != nil {
34-
return fmt.Errorf("no completed node outputs matching your query were found in the run %s: %w", run.ID.String(), err)
65+
return nil, fmt.Errorf("no completed node outputs matching your query were found in the run %s: %w", run.ID.String(), err)
3566
}
3667

3768
runDir, err := filesystem.CreateRunDir(destinationPath, *run)
3869
if err != nil {
39-
return fmt.Errorf("failed to create directory for run %s: %w", run.ID.String(), err)
70+
return nil, fmt.Errorf("failed to create directory for run %s: %w", run.ID.String(), err)
4071
}
4172

73+
var allResults []DownloadResult
74+
var errCount int
4275
for _, subJob := range matchingSubJobs {
4376
isModule := version.Data.Nodes[subJob.Name].Type == "WORKFLOW"
44-
if err := downloadSubJobOutput(client, runDir, &subJob, files, run.ID, isModule); err != nil {
45-
return fmt.Errorf("failed to download output for node %s: %w", subJob.Label, err)
77+
results := downloadSubJobOutput(client, runDir, &subJob, files, run.ID, isModule)
78+
79+
for _, result := range results {
80+
if !result.Success {
81+
errCount++
82+
}
4683
}
84+
allResults = append(allResults, results...)
4785
}
4886

49-
return nil
87+
// If all subjobs failed, return an error
88+
if errCount == len(allResults) && len(allResults) > 0 {
89+
return allResults, fmt.Errorf("failed to download outputs for all nodes")
90+
}
91+
92+
// If only some failed, return results but no error
93+
return allResults, nil
5094
}
5195

52-
func downloadSubJobOutput(client *trickest.Client, savePath string, subJob *trickest.SubJob, files []string, runID *uuid.UUID, isModule bool) error {
96+
func downloadSubJobOutput(client *trickest.Client, savePath string, subJob *trickest.SubJob, files []string, runID *uuid.UUID, isModule bool) []DownloadResult {
5397
if !subJob.TaskGroup && subJob.Status != "SUCCEEDED" {
54-
return fmt.Errorf("subjob %s (ID: %s) is not completed (status: %s)", subJob.Label, subJob.ID, subJob.Status)
98+
return []DownloadResult{{
99+
SubJobName: subJob.Label,
100+
FileName: "",
101+
Success: false,
102+
Error: fmt.Errorf("subjob %s (ID: %s) is not completed (status: %s)", subJob.Label, subJob.ID, subJob.Status),
103+
}}
55104
}
56105

57106
if subJob.TaskGroup {
@@ -61,18 +110,28 @@ func downloadSubJobOutput(client *trickest.Client, savePath string, subJob *tric
61110
return downloadSingleSubJobOutput(client, savePath, subJob, files, runID, isModule)
62111
}
63112

64-
func downloadTaskGroupOutput(client *trickest.Client, savePath string, subJob *trickest.SubJob, files []string, runID *uuid.UUID) error {
113+
func downloadTaskGroupOutput(client *trickest.Client, savePath string, subJob *trickest.SubJob, files []string, runID *uuid.UUID) []DownloadResult {
65114
ctx := context.Background()
66115
children, err := client.GetChildSubJobs(ctx, subJob.ID)
67116
if err != nil {
68-
return fmt.Errorf("could not get child subjobs for subjob %s (ID: %s): %w", subJob.Label, subJob.ID, err)
117+
return []DownloadResult{{
118+
SubJobName: subJob.Label,
119+
FileName: "",
120+
Success: false,
121+
Error: fmt.Errorf("could not get child subjobs for subjob %s (ID: %s): %w", subJob.Label, subJob.ID, err),
122+
}}
69123
}
70124
if len(children) == 0 {
71-
return fmt.Errorf("no child subjobs found for subjob %s (ID: %s)", subJob.Label, subJob.ID)
125+
return []DownloadResult{{
126+
SubJobName: subJob.Label,
127+
FileName: "",
128+
Success: false,
129+
Error: fmt.Errorf("no child subjobs found for subjob %s (ID: %s)", subJob.Label, subJob.ID),
130+
}}
72131
}
73132

74133
var mu sync.Mutex
75-
var errs []error
134+
var results []DownloadResult
76135
var wg sync.WaitGroup
77136
sem := make(chan struct{}, 5)
78137

@@ -86,51 +145,73 @@ func downloadTaskGroupOutput(client *trickest.Client, savePath string, subJob *t
86145
child, err := client.GetChildSubJob(ctx, subJob.ID, i)
87146
if err != nil {
88147
mu.Lock()
89-
errs = append(errs, fmt.Errorf("could not get child %d subjobs for subjob %s (ID: %s): %w", i, subJob.Label, subJob.ID, err))
148+
results = append(results, DownloadResult{
149+
SubJobName: fmt.Sprintf("%d-%s", i, subJob.Label),
150+
FileName: "",
151+
Success: false,
152+
Error: fmt.Errorf("could not get child %d subjobs for subjob %s (ID: %s): %w", i, subJob.Label, subJob.ID, err),
153+
})
90154
mu.Unlock()
91155
return
92156
}
93157

94158
child.Label = fmt.Sprintf("%d-%s", i, subJob.Label)
95-
if err := downloadSubJobOutput(client, savePath, &child, files, runID, false); err != nil {
96-
mu.Lock()
97-
errs = append(errs, err)
98-
mu.Unlock()
99-
}
159+
childResults := downloadSubJobOutput(client, savePath, &child, files, runID, false)
160+
161+
mu.Lock()
162+
results = append(results, childResults...)
163+
mu.Unlock()
100164
}(i)
101165
}
102166
wg.Wait()
103167

104-
if len(errs) > 0 {
105-
return fmt.Errorf("errors occurred while downloading subjob children outputs:\n%s", errors.Join(errs...))
106-
}
107-
return nil
168+
return results
108169
}
109170

110-
func downloadSingleSubJobOutput(client *trickest.Client, savePath string, subJob *trickest.SubJob, files []string, runID *uuid.UUID, isModule bool) error {
171+
func downloadSingleSubJobOutput(client *trickest.Client, savePath string, subJob *trickest.SubJob, files []string, runID *uuid.UUID, isModule bool) []DownloadResult {
111172
ctx := context.Background()
112-
var errs []error
113173

114174
subJobOutputs, err := getSubJobOutputs(client, ctx, subJob, runID, isModule)
115175
if err != nil {
116-
return err
176+
return []DownloadResult{{
177+
SubJobName: subJob.Label,
178+
FileName: "",
179+
Success: false,
180+
Error: err,
181+
}}
117182
}
118183

119184
subJobOutputs = filterSubJobOutputsByFileNames(subJobOutputs, files)
120185
if len(subJobOutputs) == 0 {
121-
return fmt.Errorf("no matching output files found for subjob %s (ID: %s)", subJob.Label, subJob.ID)
186+
return []DownloadResult{{
187+
SubJobName: subJob.Label,
188+
FileName: "",
189+
Success: false,
190+
Error: fmt.Errorf("no matching output files found for subjob %s (ID: %s)", subJob.Label, subJob.ID),
191+
}}
122192
}
123193

194+
var results []DownloadResult
195+
124196
for _, output := range subJobOutputs {
125197
if err := downloadOutput(client, savePath, subJob, output); err != nil {
126-
errs = append(errs, err)
198+
results = append(results, DownloadResult{
199+
SubJobName: subJob.Label,
200+
FileName: output.Name,
201+
Success: false,
202+
Error: err,
203+
})
204+
} else {
205+
results = append(results, DownloadResult{
206+
SubJobName: subJob.Label,
207+
FileName: output.Name,
208+
Success: true,
209+
Error: nil,
210+
})
127211
}
128212
}
129213

130-
if len(errs) > 0 {
131-
return fmt.Errorf("errors occurred while downloading subjob outputs:\n%s", errors.Join(errs...))
132-
}
133-
return nil
214+
return results
134215
}
135216

136217
func getSubJobOutputs(client *trickest.Client, ctx context.Context, subJob *trickest.SubJob, runID *uuid.UUID, isModule bool) ([]trickest.SubJobOutput, error) {

0 commit comments

Comments
 (0)