Skip to content

Commit 4209d0a

Browse files
committed
Adding the Docker integration changes
1 parent ca94dd1 commit 4209d0a

File tree

10 files changed

+395
-51
lines changed

10 files changed

+395
-51
lines changed

.github/workflows/CI_pipeline.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,13 +171,13 @@ jobs:
171171
- name: Build project ${{ matrix.project_slug }}
172172
run: |
173173
echo "--- Building ${{ matrix.project_slug }} ---"
174-
python scripts/fetch_and_build.py --filter ${{ matrix.project_slug }}
174+
conda run -n iris python scripts/fetch_and_build.py --filter ${{ matrix.project_slug }}
175175
echo "--------------------------------"
176176
177177
- name: Generate CodeQL database for ${{ matrix.project_slug }}
178178
run: |
179179
echo "--- Generating CodeQL database for ${{ matrix.project_slug }} ---"
180-
python scripts/build_codeql_dbs.py --project ${{ matrix.project_slug }}
180+
conda run -n iris python scripts/build_codeql_dbs.py --project ${{ matrix.project_slug }}
181181
echo "----------------------------------"
182182
183183
- name: Run IRIS for ${{ matrix.project_slug }} with CWE ${{ matrix.cwe }}

data/build_cmds.csv

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
project_slug,build_cmd
2+
spring-projects__spring-security_CVE-2011-2732_2.0.6.RELEASE,mvn -q -B -DskipTests compile
3+
x-stream__xstream_CVE-2013-7285_1.4.6,mvn -q -B -DskipTests compile
4+
spring-security_CVE-2025-22223_6.4.3,"./gradlew --no-daemon -S -Dorg.gradle.dependency.verification=off -Dorg.gradle.warning.mode=none -Dorg.gradle.caching=false --rerun-tasks -x compileKotlin -x compileTestKotlin clean classes"
5+
incubator-seata_CVE-2025-32897_v2.2.0,mvn -q -B -DskipTests compile
6+
cassandra-lucene-index_CVE-2025-26511_cassandra-4.0.16-1.0.0,mvn clean package -B -V -e -Dfindbugs.skip -Dcheckstyle.skip -Dpmd.skip=true -Dspotbugs.skip -Denforcer.skip -Dmaven.javadoc.skip -DskipTests -Dmaven.test.skip.exec -Dlicense.skip=true -Drat.skip=true -Dspotless.check.skip=true
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
diff --git a/pom.xml b/pom.xml
2+
index 9d373d76..654c07a2 100644
3+
--- a/pom.xml
4+
+++ b/pom.xml
5+
@@ -25,7 +25,7 @@
6+
7+
<groupId>com.instaclustr</groupId>
8+
<artifactId>cassandra-lucene-index-parent</artifactId>
9+
- <version>4.1.8-1.0.1</version>
10+
+ <version>4.1.8-1.0.0</version>
11+
<packaging>pom</packaging>
12+
13+
<name>Cassandra Lucene index</name>

environment.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,6 @@ dependencies:
1313
- transformers
1414
- pytorch=2.5
1515
- google-generativeai
16+
- pip
17+
- pip:
18+
- docker

scripts/build_codeql_dbs.py

Lines changed: 152 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,33 @@
77
import json
88
sys.path.append(str(Path(__file__).parent.parent))
99

10-
from src.config import CODEQL_DB_PATH, PROJECT_SOURCE_CODE_DIR, IRIS_ROOT_DIR, BUILD_INFO, DEP_CONFIGS, DATA_DIR
10+
from src.config import CODEQL_DB_PATH, PROJECT_SOURCE_CODE_DIR, IRIS_ROOT_DIR, BUILD_INFO, DEP_CONFIGS, DATA_DIR, CODEQL_DIR, CVES_MAPPED_W_COMMITS_DIR
11+
from scripts.docker_utils import ensure_image, create_container, exec_in_container, parse_project_image, copy_dir_to_container, copy_from_container
1112
ALLVERSIONS = json.load(open(DEP_CONFIGS))
1213

14+
# Path to custom build commands CSV
15+
BUILD_CMDS_CSV = os.path.join(DATA_DIR, "build_cmds.csv")
16+
17+
def load_custom_build_commands(csv_path: str = BUILD_CMDS_CSV) -> dict[str, str]:
    """
    Read per-project build command overrides from a CSV file.

    The file is parsed with csv.DictReader and the "project_slug" and
    "build_cmd" columns are used. Values are stripped of surrounding
    whitespace; rows where either value is missing or blank are ignored.
    If the same slug appears more than once, the last row wins.

    Returns a dict mapping project_slug -> build_cmd. A missing file, or a
    file without the expected column headers, results in an empty dict.
    """
    if not os.path.exists(csv_path):
        return {}

    def _cell(record, column):
        # DictReader may hand back None for absent columns or short rows
        return (record.get(column) or "").strip()

    with open(csv_path, "r", newline="") as handle:
        pairs = [
            (_cell(record, "project_slug"), _cell(record, "build_cmd"))
            for record in csv.DictReader(handle)
        ]

    return {slug: cmd for slug, cmd in pairs if slug and cmd}
33+
34+
# Custom build commands for the CodeQL database creation (loaded from CSV)
35+
CUSTOM_BUILD_COMMANDS: dict[str, str] = load_custom_build_commands()
36+
1337
def setup_environment(row):
1438
env = os.environ.copy()
1539

@@ -48,6 +72,9 @@ def create_codeql_database(project_slug, env, db_base_path, sources_base_path):
4872
print(f"PATH: {env.get('PATH', 'Not set')}")
4973
print(f"JAVA_HOME: {env.get('JAVA_HOME', 'Not set')}")
5074

75+
# Prefer custom build command when available
76+
custom_cmd = CUSTOM_BUILD_COMMANDS.get(project_slug)
77+
5178
try:
5279
java_version = subprocess.check_output(['java', '-version'],
5380
stderr=subprocess.STDOUT,
@@ -67,8 +94,11 @@ def create_codeql_database(project_slug, env, db_base_path, sources_base_path):
6794
database_path,
6895
"--source-root", source_path,
6996
"--language", "java",
70-
"--overwrite"
97+
"--overwrite",
7198
]
99+
if custom_cmd:
100+
print(f"Using custom build command for {project_slug}: {custom_cmd}")
101+
command.extend(["--command", custom_cmd])
72102

73103
try:
74104
print(f"Creating database at: {database_path}")
@@ -85,11 +115,121 @@ def create_codeql_database(project_slug, env, db_base_path, sources_base_path):
85115
print(f'Stderr Info:\n{e.stderr.decode()}')
86116
raise
87117

118+
119+
def create_codeql_database_in_container(project_slug: str, row: dict, db_base_path: str, verbose: bool = False) -> None:
    """
    Create a CodeQL Java database for a project inside its Docker container.

    Steps performed:
      1. Look up the repository URL and commit for the slug.
      2. Ensure the project image is available and create/start a container.
      3. Copy the CodeQL CLI (CODEQL_DIR) into the container.
      4. Re-clone the repository at the recorded commit under /workspace/repo.
      5. Apply DATA_DIR/patches/<project_slug>.patch if that file exists.
      6. Run `codeql database create`, using the custom build command from
         CUSTOM_BUILD_COMMANDS when one is registered for the slug.
      7. Copy the resulting database directory back to the host.

    Args:
        project_slug: Project identifier; also used to derive the image name,
            the patch file name and the database name ("<slug>-docker").
        row: Build-info record for the project. Currently unused; kept so the
            call signature stays compatible with existing callers.
        db_base_path: Host directory that receives the database directory.
        verbose: When True, container command output is streamed.

    Raises:
        RuntimeError: If the slug cannot be found, or if fetching sources,
            applying the patch, or creating the database fails.
    """
    import shlex  # local import: only this function builds shell command lines

    # Resolve the repository first so an unknown slug fails before any Docker work
    repo_url, commit_id = _get_repo_info_from_project_info(project_slug)

    image = parse_project_image(project_slug)  # Parse the project image from the project slug
    ensure_image(image)

    # Add -docker suffix to database name when using container
    db_project_slug = f"{project_slug}-docker"

    # Prepare host and container paths
    host_db_dir = os.path.abspath(db_base_path)
    os.makedirs(host_db_dir, exist_ok=True)
    container_codeql_dir = "/codeql"
    container_codeql_bin = f"{container_codeql_dir}/codeql"
    container_out_base = "/out"
    container_db_dir = f"{container_out_base}/{db_project_slug}"
    container_source_root = "/workspace/repo"

    container = create_container(image=image, working_dir=container_source_root)
    try:
        container.start()

        # Copy CodeQL CLI into container and ensure out dir exists
        copy_dir_to_container(container, CODEQL_DIR, container_codeql_dir)
        exec_in_container(container, ["bash", "-lc", f"mkdir -p {container_out_base}"])

        # Fresh fetch like fetch_one.py: reclone at desired commit.
        # Values read from the CSV are shell-quoted because the whole string
        # is interpreted by `bash -lc`.
        fetch_cmd = (
            "rm -rf repo && mkdir -p repo && cd repo && "
            f"git init && git remote add origin {shlex.quote(repo_url)} && "
            f"git fetch --no-tags --depth 1 origin {shlex.quote(commit_id)} && "
            "git reset --hard FETCH_HEAD && "
            "git fetch --no-tags --depth 1 origin '+refs/heads/*:refs/remotes/origin/*'"
        )
        print(f"Refreshing sources from {repo_url} @ {commit_id}")
        code, output = exec_in_container(
            container,
            ["bash", "-lc", fetch_cmd],
            workdir="/workspace",
            stream=verbose,
        )
        if code != 0:
            if output:
                print(output)
            raise RuntimeError(f"Failed to fetch sources for {project_slug}")

        # Apply project patch if available (mirror fetch_one.py behavior)
        patch_dir_host = os.path.join(DATA_DIR, "patches")
        patch_file_host = os.path.join(patch_dir_host, f"{project_slug}.patch")
        if os.path.exists(patch_file_host):
            print(f"Found patch for {project_slug}, applying...")
            # Copy entire patches dir to container to keep logic simple
            copy_dir_to_container(container, patch_dir_host, "/patches")
            container_patch_file = f"/patches/{project_slug}.patch"
            code, output = exec_in_container(
                container,
                ["bash", "-lc", f"git apply {shlex.quote(container_patch_file)}"],
                workdir=container_source_root,
                stream=verbose,
            )
            if code != 0:
                if output:
                    print(output)
                raise RuntimeError(f"Failed to apply patch for {project_slug}")
        else:
            print("No patch found; skipping patching.")

        # Assemble the CodeQL invocation as an argument list and let shlex
        # quote it, so a build command containing quotes, `$` or backticks
        # reaches CodeQL as one unmodified --command argument.
        codeql_args = [
            container_codeql_bin, "database", "create", container_db_dir,
            "--source-root", container_source_root,
            "--language", "java",
            "--overwrite",
        ]
        # Prefer custom build command when available
        custom_cmd = CUSTOM_BUILD_COMMANDS.get(project_slug)
        if custom_cmd:
            print(f"Using custom build command for {project_slug}: {custom_cmd}")
            codeql_args.extend(["--command", custom_cmd])
        codeql_cmd = shlex.join(codeql_args)

        print(f"Initializing database at {container_db_dir}.")
        code, output = exec_in_container(container, ["bash", "-lc", codeql_cmd], workdir=container_source_root, stream=verbose)

        if code != 0:
            print(f"CodeQL database creation failed for {project_slug}")
            if not verbose and output:
                print("Error output:")
                print(output)
            raise RuntimeError(f"CodeQL database creation failed in container for {project_slug}")

        print(f"Finalizing database at {container_db_dir}.")
        # Copy database back to host
        copy_from_container(container, container_db_dir, host_db_dir)
        print(f"Successfully created database at {host_db_dir}/{db_project_slug}.")
    finally:
        # Cleanup is best-effort: report a failure but never mask the real error
        try:
            container.remove(force=True)
        except Exception as cleanup_error:
            print(f"Warning: failed to remove container for {project_slug}: {cleanup_error}")


def _get_repo_info_from_project_info(slug: str) -> tuple[str, str]:
    """
    Return (repo_url, commit_id) for a project slug.

    Scans the CSV at CVES_MAPPED_W_COMMITS_DIR, skipping the header row, and
    matches the slug against column index 1. The repository URL is read from
    column index 8 and the commit id from column index 10.

    Raises:
        RuntimeError: If no row with more than 10 columns matches the slug.
    """
    # newline="" is what the csv module expects for files it reads
    with open(CVES_MAPPED_W_COMMITS_DIR, "r", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # header
        for line in reader:
            if len(line) > 10 and line[1] == slug:
                return line[8], line[10]
    raise RuntimeError(f"Project slug '{slug}' not found in project_info.csv")
225+
88226
def main():
89227
parser = argparse.ArgumentParser(description='Create CodeQL databases for cwe-bench-java projects')
90228
parser.add_argument('--project', help='Specific project slug', default=None)
91229
parser.add_argument('--db-path', help='Base path for storing CodeQL databases', default=CODEQL_DB_PATH)
92230
parser.add_argument('--sources-path', help='Base path for project sources', default=PROJECT_SOURCE_CODE_DIR)
231+
parser.add_argument('--use-container', action='store_true', help='Create DB inside the project container using mounted CodeQL')
232+
parser.add_argument('--verbose', action='store_true', help='Show verbose output during database creation')
93233
args = parser.parse_args()
94234

95235
# Load build information
@@ -98,14 +238,20 @@ def main():
98238
if args.project:
99239
project = next((p for p in projects if p['project_slug'] == args.project), None)
100240
if project:
101-
env = setup_environment(project)
102-
create_codeql_database(project['project_slug'], env, args.db_path, args.sources_path)
241+
if args.use_container:
242+
create_codeql_database_in_container(project['project_slug'], project, args.db_path, args.verbose)
243+
else:
244+
env = setup_environment(project)
245+
create_codeql_database(project['project_slug'], env, args.db_path, args.sources_path)
103246
else:
104247
print(f"Project {args.project} not found in CSV file")
105248
else:
106249
for project in projects:
107-
env = setup_environment(project)
108-
create_codeql_database(project['project_slug'], env, args.db_path, args.sources_path)
250+
if args.use_container:
251+
create_codeql_database_in_container(project['project_slug'], project, args.db_path, args.verbose)
252+
else:
253+
env = setup_environment(project)
254+
create_codeql_database(project['project_slug'], env, args.db_path, args.sources_path)
109255

110256
# Location of build_info_local.csv file
111257
LOCAL_BUILD_INFO = os.path.join(DATA_DIR, "build-info", "build_info_local.csv")

scripts/build_one.py

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
sys.path.append(str(ROOT_DIR))
2828

2929
from src.config import DATA_DIR, DEP_CONFIGS
30+
from scripts.docker_utils import create_container, ensure_image, exec_in_container, parse_project_image
3031

3132
# Load dependency configurations
3233
ALLVERSIONS = json.load(open(DEP_CONFIGS))
@@ -108,11 +109,11 @@ def get_build_info_from_csv(project_slug, csv_path):
108109
for row in reader:
109110
if row['project_slug'] == project_slug and row['status'] == 'success':
110111
specific_attempt = {}
111-
if row['jdk_version'] != 'n/a':
112+
if row['jdk_version'] not in ('n/a', '', None):
112113
specific_attempt['jdk'] = row['jdk_version']
113-
if row['mvn_version'] != 'n/a':
114+
if row['mvn_version'] not in ('n/a', '', None):
114115
specific_attempt['mvn'] = row['mvn_version']
115-
if row['gradle_version'] != 'n/a':
116+
if row['gradle_version'] not in ('n/a', '', None):
116117
specific_attempt['gradle'] = row['gradle_version']
117118
if row['use_gradlew'] != 'n/a':
118119
if row['use_gradlew'] == "True":
@@ -372,7 +373,32 @@ def validate_and_create_custom_attempt(jdk, mvn, gradle, gradlew):
372373
return custom_attempt
373374

374375

375-
def build_project(project_slug, try_all=False, custom_attempt=None):
376+
def build_inside_container(project_slug: str, attempt: dict) -> bool:
    """
    Run a build for the project inside a container created from its image.

    The build tool is chosen from the keys present in `attempt`, checked in
    the order "mvn", "gradle", "gradlew"; the first key found selects a fixed
    shell command that is executed in /workspace/repo with streamed output.

    Returns True when the command exits with code 0. Returns False when the
    command fails or when `attempt` contains none of the recognised keys.
    The container is force-removed afterwards in every case.
    """
    repo_dir = "/workspace/repo"

    # Ordered (attempt key, shell command) pairs; the first matching key wins.
    # NOTE(review): the gradlew command only runs the `clean` task — confirm
    # whether a compile/build task was intended here.
    tool_commands = (
        ("mvn", "mvn -B -e -U -DskipTests clean package"),
        ("gradle", "gradle build --parallel"),
        ("gradlew", "chmod +x ./gradlew && ./gradlew --no-daemon -S -Dorg.gradle.dependency.verification=off clean"),
    )

    image = parse_project_image(project_slug)
    ensure_image(image)
    container = create_container(image=image, working_dir=repo_dir)
    try:
        container.start()

        shell_line = next(
            (line for tool, line in tool_commands if tool in attempt),
            None,
        )
        if shell_line is None:
            return False

        exit_code, _ = exec_in_container(
            container,
            ["bash", "-lc", shell_line],
            workdir=repo_dir,
            environment={},
            stream=True,
        )
        return exit_code == 0
    finally:
        # Best-effort cleanup; removal errors are deliberately ignored
        try:
            container.remove(force=True)
        except Exception:
            pass
399+
400+
401+
def build_project(project_slug, try_all=False, custom_attempt=None, use_container: bool = False):
376402
"""Main function to build a project with various strategies."""
377403
# Handle custom attempt first
378404
if custom_attempt:
@@ -385,12 +411,12 @@ def build_project(project_slug, try_all=False, custom_attempt=None):
385411
if not try_all:
386412
# Try local build info first
387413
local_build_info = get_build_info_from_csv(project_slug, f"{DATA_DIR}/build-info/build_info_local.csv")
388-
if local_build_info and try_build_with_attempt(project_slug, local_build_info, "local"):
414+
if local_build_info and (try_build_with_attempt(project_slug, local_build_info, "local") if not use_container else build_inside_container(project_slug, local_build_info)):
389415
return True
390416

391417
# Try global build info if local failed
392418
global_build_info = get_build_info_from_csv(project_slug, f"{DATA_DIR}/build_info.csv")
393-
if global_build_info and try_build_with_attempt(project_slug, global_build_info, "global"):
419+
if global_build_info and (try_build_with_attempt(project_slug, global_build_info, "global") if not use_container else build_inside_container(project_slug, global_build_info)):
394420
return True
395421

396422
# Try all default attempts
@@ -399,7 +425,7 @@ def build_project(project_slug, try_all=False, custom_attempt=None):
399425
"No successful build configuration found in CSV files, trying all version combinations..."))
400426

401427
for attempt in ATTEMPTS:
402-
if try_build_with_attempt(project_slug, attempt):
428+
if (try_build_with_attempt(project_slug, attempt) if not use_container else build_inside_container(project_slug, attempt)):
403429
return True
404430

405431
print(f"[build_one] All build attempts failed for {project_slug}")
@@ -451,6 +477,11 @@ def main():
451477
action="store_true",
452478
help="Use the project's gradlew script"
453479
)
480+
parser.add_argument(
481+
"--use-container",
482+
action="store_true",
483+
help="Build inside the project's container image",
484+
)
454485

455486
args = parser.parse_args()
456487

@@ -462,7 +493,7 @@ def main():
462493
return 1
463494
custom_attempt = validate_and_create_custom_attempt(args.jdk, args.mvn, args.gradle, args.gradlew)
464495

465-
success = build_project(args.project_slug, try_all=args.try_all, custom_attempt=custom_attempt)
496+
success = build_project(args.project_slug, try_all=args.try_all, custom_attempt=custom_attempt, use_container=args.use_container)
466497
return 0 if success else 1
467498

468499

0 commit comments

Comments
 (0)