Skip to content

Commit 4d2a116

Browse files
committed
feat: Phase 7 - AI self healing with GPT-4 and Slack
1 parent 48ae795 commit 4d2a116

File tree

4 files changed

+202
-0
lines changed

4 files changed

+202
-0
lines changed

ai-healer/Dockerfile

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Image for the ai-healer Flask service, served by gunicorn on port 5010.
FROM python:3.11-slim
WORKDIR /app

# --no-install-recommends keeps optional packages out of the slim image;
# the apt lists are removed in the same layer so they never reach the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends wget curl \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies before copying the source so this layer stays cached
# when only the application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY src/ .

# Build-time smoke test: fail the build if a core dependency cannot be imported.
RUN python -c "import flask; import openai; import kubernetes; print('All imports OK')"

# The process holds API credentials and a service-account token, so do not
# run it as root. Port 5010 is unprivileged and needs no extra capability.
RUN useradd --create-home --uid 10001 healer
USER healer

EXPOSE 5010
CMD ["gunicorn", "--workers", "1", "--bind", "0.0.0.0:5010", "--timeout", "120", "healer:app"]

ai-healer/requirements.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
flask==3.0.3
2+
# Keep below 1.0: healer.py calls openai.ChatCompletion.create, which the 1.x client removed.
openai==0.28.0
3+
kubernetes==29.0.0
4+
requests==2.31.0
5+
gunicorn==22.0.0

ai-healer/src/healer.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
from flask import Flask, request, jsonify
2+
import requests
3+
import os
4+
import json
5+
import logging
6+
import openai
7+
8+
# Log at INFO so the per-alert progress messages are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# WSGI application object; the container starts it as "healer:app".
app = Flask(__name__)

# Credentials are read from the environment; each is None when unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")
SLACK_WEBHOOK = os.environ.get("SLACK_WEBHOOK_URL")
15+
16+
17+
def ask_gpt4(pod_name, logs):
    """Ask the OpenAI chat model to diagnose a crashed pod.

    Args:
        pod_name: Name of the crashed pod, interpolated into the prompt.
        logs: Log text for the pod, interpolated into the prompt verbatim.

    Returns:
        dict: The model's reply parsed as a JSON object. The prompt requests
        the keys "diagnosis", "root_cause", "fix" and "confidence", but the
        reply is not validated beyond being a JSON object.

    Raises:
        ValueError: If the reply is not valid JSON (json.JSONDecodeError is a
            ValueError subclass) or is valid JSON that is not an object.
        Exception: Errors raised by the OpenAI client propagate unchanged.
    """
    prompt = (
        "\n"
        f"Kubernetes pod '{pod_name}' has crashed.\n"
        "Last 50 lines of logs:\n"
        f"{logs}\n"
        "Respond ONLY in valid JSON:\n"
        "{\n"
        '"diagnosis": "what went wrong in 1-2 sentences",\n'
        '"root_cause": "specific technical cause",\n'
        '"fix": "restart_pod or manual_review",\n'
        '"confidence": 0.95\n'
        "}\n"
    )
    response = openai.ChatCompletion.create(
        # Overridable without a rebuild; the default is the model used so far.
        model=os.getenv("OPENAI_MODEL", "gpt-3.5-turbo"),
        messages=[
            {"role": "system", "content": "You are a Kubernetes expert. Respond in valid JSON only."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.1,
        # Bound the call so a stalled request fails inside the 120 s gunicorn
        # worker timeout instead of getting the only worker killed.
        request_timeout=60,
    )
    raw = response.choices[0].message.content or ""

    # Models often wrap JSON in a Markdown fence or add a sentence around it:
    # drop the fences, then keep only the outermost {...} span if one exists.
    raw = raw.replace("```json", "").replace("```", "").strip()
    start = raw.find("{")
    end = raw.rfind("}")
    if start != -1 and end > start:
        raw = raw[start:end + 1]

    result = json.loads(raw)
    if not isinstance(result, dict):
        # Callers use .get() on the result, so a list or scalar is unusable.
        raise ValueError(f"Expected a JSON object from the model, got {type(result).__name__}")
    return result
41+
42+
43+
def send_slack(pod_name, diagnosis, fix_applied, confidence):
    """Post a diagnosis summary to the configured Slack incoming webhook.

    Delivery is best-effort: failures are logged and never raised.

    Args:
        pod_name: Pod the alert refers to; shown in the attachment title.
        diagnosis: Mapping with optional "diagnosis", "root_cause" and "fix"
            entries. Anything that is not a dict is treated as empty.
        fix_applied: Truthy when an automatic fix was applied; selects the
            green "auto-fixed" status instead of the red "manual review" one.
        confidence: Model confidence as a fraction (0.95 renders as "95%").
            Values that cannot be converted to a finite number render as "0%".

    Returns:
        None
    """
    if not SLACK_WEBHOOK:
        logger.warning("Slack webhook URL is not configured; skipping notification")
        return

    if not isinstance(diagnosis, dict):
        diagnosis = {}

    # The confidence comes from model output, so it may be a string, null,
    # NaN or infinity; none of those should abort the notification.
    try:
        confidence_pct = int(float(confidence) * 100)
    except (TypeError, ValueError, OverflowError):
        confidence_pct = 0

    if fix_applied:
        color = "#36a64f"
        status = "✅ Auto-fixed by AI"
    else:
        color = "#ff0000"
        status = "🚨 Needs manual review"

    message = {
        "attachments": [{
            "color": color,
            "title": f"🤖 AI Healer — {pod_name}",
            "fields": [
                {"title": "Status", "value": status, "short": True},
                {"title": "Confidence", "value": f"{confidence_pct}%", "short": True},
                # str() keeps the payload valid when the model returns a
                # non-string value for one of these fields.
                {"title": "Diagnosis", "value": str(diagnosis.get("diagnosis", "Unknown")), "short": False},
                {"title": "Root Cause", "value": str(diagnosis.get("root_cause", "Unknown")), "short": False},
                {"title": "Action", "value": str(diagnosis.get("fix", "None")), "short": False},
            ],
            "footer": "AI DevOps Platform",
        }]
    }

    try:
        # Without a timeout an unresponsive endpoint would block the worker.
        response = requests.post(SLACK_WEBHOOK, json=message, timeout=10)
        # A 4xx/5xx reply means the message was not delivered.
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Slack failed: {e}")
    else:
        logger.info("Slack notification sent")
70+
71+
72+
@app.route('/health')
def health():
    """Return a static 200 payload identifying the service as healthy."""
    payload = {"status": "healthy", "service": "ai-healer"}
    return jsonify(payload), 200
75+
76+
77+
@app.route('/webhook', methods=['POST'])
def handle_alert():
    """Handle an alert webhook: diagnose each firing alert and notify Slack.

    The body is expected to be a JSON object with an "alerts" list whose
    entries carry a "labels" mapping with optional "pod" and "namespace"
    keys. NOTE(review): this matches the Prometheus Alertmanager webhook
    payload, which is presumably the sender - confirm.

    Returns:
        A (response, status) pair: 400 with an "error" message when the body
        is empty or malformed, otherwise 200 with {"status": "processed"}.
    """
    data = request.get_json()
    if not data:
        return jsonify({"error": "No data"}), 400
    if not isinstance(data, dict):
        # A JSON list or scalar has no .get(); reject it instead of crashing.
        return jsonify({"error": "Expected a JSON object"}), 400

    alerts = data.get('alerts') or []
    if not isinstance(alerts, list):
        return jsonify({"error": "'alerts' must be a list"}), 400

    for alert in alerts:
        if not isinstance(alert, dict):
            logger.warning("Skipping malformed alert entry: %r", alert)
            continue

        # A resolved alert means the condition has cleared; diagnosing it
        # would report a recovered pod as a fresh crash.
        if alert.get('status') == 'resolved':
            logger.info("Skipping resolved alert")
            continue

        labels = alert.get('labels')
        if not isinstance(labels, dict):
            labels = {}
        pod_name = labels.get('pod', 'unknown-pod')
        namespace = labels.get('namespace', 'default')

        logger.info(f"Processing alert for pod: {pod_name}")

        # Simulate logs since we are running locally
        logs = f"Pod {pod_name} in namespace {namespace} crashed unexpectedly."

        try:
            diagnosis = ask_gpt4(pod_name, logs)
            if not isinstance(diagnosis, dict):
                raise ValueError("model reply was not a JSON object")
        except Exception as e:
            # Best-effort: any failure falls back to a manual-review notice
            # so the remaining alerts are still processed.
            logger.error(f"GPT call failed: {e}")
            diagnosis = {
                "diagnosis": "AI analysis failed",
                "root_cause": str(e),
                "fix": "manual_review",
                "confidence": 0.0
            }

        # The confidence is model output: coerce it to a number, and treat
        # NaN or anything outside [0, 1] as no confidence.
        try:
            confidence = float(diagnosis.get('confidence', 0.0))
        except (TypeError, ValueError):
            confidence = 0.0
        if not 0.0 <= confidence <= 1.0:
            confidence = 0.0

        # No automatic remediation is implemented, so this is always False.
        fix_applied = False

        send_slack(pod_name, diagnosis, fix_applied, confidence)

    return jsonify({"status": "processed"}), 200
110+
111+
112+
if __name__ == "__main__":
    # Direct-execution entry point using Flask's built-in server.
    app.run(port=5010, host="0.0.0.0")

k8s/ai-healer.yaml

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Single-replica Deployment for the ai-healer service.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ai-healer
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ai-healer
  template:
    metadata:
      labels:
        app: ai-healer
    spec:
      # Pods run under the ai-healer-sa ServiceAccount.
      serviceAccountName: ai-healer-sa
      containers:
        - name: ai-healer
          image: arunrox47/ai-healer:latest
          ports:
            - containerPort: 5010
          # Both credentials are read from the ai-healer-secrets Secret.
          env:
            - name: OPENAI_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ai-healer-secrets
                  key: OPENAI_API_KEY
            - name: SLACK_WEBHOOK_URL
              valueFrom:
                secretKeyRef:
                  name: ai-healer-secrets
                  key: SLACK_WEBHOOK_URL
          # The pod is marked ready once GET /health on port 5010 succeeds.
          readinessProbe:
            httpGet:
              path: /health
              port: 5010
            initialDelaySeconds: 10
            periodSeconds: 5
38+
---
39+
# Service that routes port 5010 to pods labelled app=ai-healer.
apiVersion: v1
kind: Service
metadata:
  name: ai-healer
spec:
  selector:
    app: ai-healer
  ports:
    - port: 5010
      targetPort: 5010
49+
---
50+
# ServiceAccount named by the Deployment's serviceAccountName and by the
# ClusterRoleBinding subject.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: ai-healer-sa
54+
---
55+
# Cluster-wide permission to read, watch and delete pods and to read pod logs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: ai-healer-role
rules:
  - apiGroups:
      - ""
    resources:
      - pods
      - pods/log
    verbs:
      - get
      - list
      - delete
      - watch
63+
---
64+
# Grants ai-healer-role to the ai-healer-sa ServiceAccount. The subject is
# pinned to the "default" namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: ai-healer-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: ai-healer-role
subjects:
  - kind: ServiceAccount
    name: ai-healer-sa
    namespace: default

0 commit comments

Comments
 (0)