Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 214 additions & 0 deletions scrape_github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""
github_ai_theme_watcher.py

Veille thématique GitHub orientée IA — recherche de projets par thème (ex: "LLM", "diffusion", "RAG", ...)
Stocke des résultats synthétiques dans une base SQLite pour consommation par un dashboard / newsletter / alertes.

Usage:
python github_ai_theme_watcher.py # tourne en continu (sleep INTERVAL)
python github_ai_theme_watcher.py --once # exécute une seule itération (utile pour cron/tests)

Configure via variables en tête du fichier ou via variables d'environnement:
- GITHUB_TOKEN: token (optionnel mais recommandé)
"""

import os
import sys
import time
import sqlite3
import requests
import argparse
from datetime import datetime
from typing import List


# AI-oriented watch themes: each entry becomes one GitHub Search query.
THEMES = [
    "large-language-model",
    "llm",
    "transformer",
    "text-generation",
    "retrieval-augmented-generation",
    "rag",
    "agents",
    "chatbot",
    "fine-tuning",
    "quantization",
    "lora",
    "peft",
    "diffusion",
    "stable-diffusion",
    "image-generation",
    "multimodal",
    "speech-to-text",
    "speech-synthesis",
    "audio",
    "reinforcement-learning",
    "computer-vision",
]

# Number of repositories requested per theme on each iteration.
RESULTS_PER_THEME = 20

# Seconds to sleep between iterations; 21600 s = 6 h by default,
# overridable through the GITHUB_WATCHER_INTERVAL environment variable.
INTERVAL = int(os.getenv("GITHUB_WATCHER_INTERVAL", 21600))

# SQLite database file stored next to this script.
DB_FILE = os.path.join(os.path.dirname(__file__), "github_ai_trending.db")

# Optional personal access token; authenticated requests get far higher
# Search API rate limits.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

HEADERS = {
    "Accept": "application/vnd.github+json",
    "User-Agent": "github-ai-theme-watcher/1.0"
}
if GITHUB_TOKEN:
    # Bearer auth header is only attached when a token is configured.
    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"
# Module-level connection/cursor shared by every function in this script.
conn = sqlite3.connect(DB_FILE)
cur = conn.cursor()

# Latest snapshot per repository: one row per repo, replaced on re-discovery
# (see the INSERT OR REPLACE in save_project).
cur.execute("""
CREATE TABLE IF NOT EXISTS trending_ai_projects (
full_name TEXT PRIMARY KEY,
name TEXT,
description TEXT,
stars INTEGER,
language TEXT,
theme TEXT,
updated_at TEXT,
html_url TEXT,
last_seen TIMESTAMP
)
""")
conn.commit()

# Append-only history of star counts, one row per sighting, so trends can be
# reconstructed later (e.g. by a dashboard).
cur.execute("""
CREATE TABLE IF NOT EXISTS project_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
full_name TEXT,
stars INTEGER,
updated_at TEXT,
captured_at TIMESTAMP
)
""")
conn.commit()

def search_github_repos(query: str, per_page: int = RESULTS_PER_THEME) -> List[dict]:
    """Search GitHub repositories via the Search API, sorted by stars desc.

    Args:
        query: the search query string (e.g. "transformer language:python").
        per_page: number of results to request; clamped to GitHub's cap of 100.

    Returns:
        The list of repository dicts from the response's "items" field.

    Raises:
        RateLimitError: when GitHub signals rate limiting (HTTP 403 or 429).
        requests.HTTPError: on any other non-2xx response.
    """
    url = "https://api.github.com/search/repositories"
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        # GitHub silently caps per_page at 100; clamp explicitly.
        "per_page": min(per_page, 100),
    }
    resp = requests.get(url, headers=HEADERS, params=params, timeout=20)
    # GitHub uses 403 (primary limit) and 429 (secondary limits) for
    # rate limiting; the original only handled 403.
    if resp.status_code in (403, 429):
        retry_after = resp.headers.get("Retry-After")
        if retry_after is None:
            # Fall back to X-RateLimit-Reset (epoch seconds) when present.
            reset = resp.headers.get("X-RateLimit-Reset")
            if reset and reset.isdigit():
                retry_after = str(max(0, int(reset) - int(time.time())))
        raise RateLimitError(
            retry_after=int(retry_after) if retry_after and retry_after.isdigit() else None
        )
    resp.raise_for_status()
    data = resp.json()
    return data.get("items", [])

def sanitize_text(s):
    """Coerce *s* to a string, mapping None to the empty string."""
    return "" if s is None else str(s)

def save_project(repo: dict, theme: str):
    """Upsert the repository's main record and append a history snapshot.

    Both writes are committed in one transaction, so the snapshot table and
    the history table cannot drift apart if the process dies between them
    (the original committed each statement separately).

    Args:
        repo: raw repository dict from the GitHub Search API.
        theme: the watch theme under which this repository was found.
    """
    full_name = repo.get("full_name")
    if not full_name:
        # Without the primary key there is nothing meaningful to store.
        return
    name = repo.get("name")
    desc = sanitize_text(repo.get("description"))
    stars = repo.get("stargazers_count", 0)
    language = repo.get("language") or ""
    updated_at = repo.get("updated_at") or repo.get("pushed_at") or datetime.utcnow().isoformat()
    html_url = repo.get("html_url") or f"https://github.com/{full_name}"
    now = datetime.utcnow().isoformat()

    cur.execute("""
        INSERT OR REPLACE INTO trending_ai_projects
        (full_name, name, description, stars, language, theme, updated_at, html_url, last_seen)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (full_name, name, desc, stars, language, theme, updated_at, html_url, now))
    cur.execute("""
        INSERT INTO project_history (full_name, stars, updated_at, captured_at)
        VALUES (?, ?, ?, ?)
    """, (full_name, stars, updated_at, now))
    # Single commit for both statements: atomic snapshot + history write.
    conn.commit()


class RateLimitError(Exception):
    """Raised when the GitHub API reports that the rate limit was exceeded."""

    def __init__(self, retry_after=None):
        # Seconds to wait before retrying, or None when GitHub gave no hint.
        self.retry_after = retry_after
        super().__init__(f"Rate limit hit on GitHub API. Retry after: {retry_after}")


def build_query_for_theme(theme: str) -> str:
    """Build the GitHub Search query string for *theme*.

    Single-token themes are passed through unchanged; multi-word themes are
    quoted as an exact phrase. The original substituted '+' for spaces, which
    is a bug: `requests` percent-encodes a literal '+' as %2B, so GitHub
    received a different query than intended. Leaving the space (or quoting)
    lets the URL encoder do the right thing.

    Returns:
        The query, restricted to repos with more than 50 stars, matched
        against name, description and readme.
    """
    theme_token = f'"{theme}"' if " " in theme else theme
    return f"{theme_token} in:name,description,readme stars:>50"

def run_once(themes=THEMES):
    """Run one full watch iteration over *themes*.

    A theme interrupted by a rate limit is retried once after the pause —
    the original slept but then silently dropped the theme.

    Returns:
        The number of repository records saved during this iteration.
    """
    print(f"[{datetime.utcnow().isoformat()}] Démarrage d'une itération de veille (thèmes: {len(themes)})")
    total_saved = 0
    queue = list(themes)
    retried = set()  # themes already retried once, to avoid endless loops
    while queue:
        theme = queue.pop(0)
        try:
            q = build_query_for_theme(theme)
            print(f"-> Recherche thème '{theme}' (q={q})")
            items = search_github_repos(q)
            print(f" ↳ {len(items)} résultats récupérés pour '{theme}'")
            for repo in items:
                save_project(repo, theme)
                total_saved += 1
        except RateLimitError as rle:
            wait = rle.retry_after or 60
            print(f"[RATE LIMIT] Limit atteint. Pause {wait} secondes.")
            time.sleep(wait)
            if theme not in retried:
                # Put the interrupted theme back at the head of the queue.
                retried.add(theme)
                queue.insert(0, theme)
        except Exception as e:
            # Best-effort: one failing theme must not abort the iteration.
            print(f"[ERREUR] thème '{theme}': {e}")
    print(f"[{datetime.utcnow().isoformat()}] Itération terminée — {total_saved} enregistrements traités.")
    return total_saved

def main_loop(interval=INTERVAL, once=False):
    """Drive the watcher: one pass when once=True, else an endless loop.

    Args:
        interval: seconds to sleep between iterations.
        once: perform a single iteration and return (cron/test mode).

    The SQLite connection is now closed on every exit path — the original
    returned early in --once mode and leaked the connection.
    """
    try:
        run_once()
        while not once:
            print(f"Attente {interval} secondes avant la prochaine vérification...")
            time.sleep(interval)
            run_once()
    except KeyboardInterrupt:
        print("")
    finally:
        conn.close()

def parse_args():
    """Parse the watcher's command-line options and return the namespace."""
    parser = argparse.ArgumentParser(description="Veille thématique GitHub orientée IA")
    parser.add_argument("--once", action="store_true",
                        help="Exécuter une unique itération et quitter")
    parser.add_argument("--interval", type=int, default=INTERVAL,
                        help="Intervalle entre itérations (secondes)")
    parser.add_argument("--themes", type=str,
                        help="Liste de thèmes séparés par des virgules (remplace la config)")
    return parser.parse_args()

if __name__ == "__main__":
args = parse_args()
if args.themes:
THEMES = [t.strip() for t in args.themes.split(",") if t.strip()]
print(f"Themes remplacés: {THEMES}")

INTERVAL = args.interval

print("Github AI Theme Watcher démarré.")
if GITHUB_TOKEN:
print("")
else:
print("")

main_loop(interval=INTERVAL, once=args.once)
146 changes: 146 additions & 0 deletions scrape_hf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import os
import time
import sqlite3
import requests
from datetime import datetime

# Seconds between polling iterations.
INTERVAL = 300

# SQLite database file stored next to this script.
DB_FILE = os.path.join(os.path.dirname(__file__), "huggingface_hub.db")
# Module-level connection/cursor shared by every function below.
conn = sqlite3.connect(DB_FILE)
cur = conn.cursor()

# One table for every item kind (model/dataset/space/collection/organization/
# paper); the `type` column distinguishes them. Rows are only ever inserted
# with INSERT OR IGNORE, so the first-seen record is kept.
cur.execute("""
CREATE TABLE IF NOT EXISTS hubs (
id TEXT PRIMARY KEY,
name TEXT,
author TEXT,
likes INTEGER,
downloads INTEGER,
task TEXT,
last_modified TEXT,
type TEXT,
url TEXT
)
""")
conn.commit()


def fetch_models():
    """Return the 20 most recently modified models from the Hugging Face API."""
    endpoint = "https://huggingface.co/api/models?sort=lastModified&direction=-1&limit=20"
    response = requests.get(endpoint, timeout=20)
    response.raise_for_status()
    return response.json()

def fetch_datasets():
    """Return the 20 most recently modified datasets from the Hugging Face API."""
    endpoint = "https://huggingface.co/api/datasets?sort=lastModified&direction=-1&limit=20"
    response = requests.get(endpoint, timeout=20)
    response.raise_for_status()
    return response.json()

def fetch_spaces():
    """Return the 20 most recently modified Spaces from the Hugging Face API."""
    endpoint = "https://huggingface.co/api/spaces?sort=lastModified&direction=-1&limit=20"
    response = requests.get(endpoint, timeout=20)
    response.raise_for_status()
    return response.json()

def fetch_collections():
    """Return recent collections; [] when the endpoint answers 404."""
    endpoint = "https://huggingface.co/api/collections?sort=lastModified&direction=-1&limit=20"
    response = requests.get(endpoint, timeout=20)
    # The collections endpoint may not exist on every deployment.
    if response.status_code == 404:
        return []
    response.raise_for_status()
    return response.json()

def fetch_organizations():
    """Return up to 20 organizations from the Hugging Face API."""
    endpoint = "https://huggingface.co/api/organizations?limit=20"
    response = requests.get(endpoint, timeout=20)
    response.raise_for_status()
    return response.json()

def fetch_papers():
    """Return recent research papers, or [] if the endpoint is unavailable.

    This endpoint is best-effort: any failure (404, network error, bad JSON)
    yields an empty list rather than propagating.
    """
    endpoint = "https://huggingface.co/api/papers?sort=lastModified&direction=-1&limit=20"
    try:
        response = requests.get(endpoint, timeout=20)
        if response.status_code == 404:
            return []
        response.raise_for_status()
        return response.json()
    except Exception:
        return []


def save_item(item, item_type):
    """Insert one Hub item into the `hubs` table.

    INSERT OR IGNORE keeps the first-seen row; an existing id is untouched.

    Args:
        item: raw JSON dict from the Hugging Face API.
        item_type: one of "model", "dataset", "space", "collection",
            "organization", "paper".

    Fix over the original: `dict.get(key, default)` returns None when the
    key exists with a None value, so None could be stored where "" / 0 was
    intended. All fields are now normalized explicitly.
    """
    # pipeline_tag may be absent, None, a string, or (defensively) a list.
    task = item.get("pipeline_tag")
    if isinstance(task, list):
        task = ", ".join(task)
    elif task is None:
        task = ""

    cur.execute("""
        INSERT OR IGNORE INTO hubs (id, name, author, likes, downloads, task, last_modified, type, url)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        item.get("id"),
        item.get("name") or item.get("modelId") or item.get("id"),
        # `or` chains also cover keys present with a None value.
        item.get("author") or item.get("organization") or "",
        item.get("likes") or 0,
        item.get("downloads") or 0,
        task,
        item.get("lastModified") or item.get("last_modified") or datetime.utcnow().isoformat(),
        item_type,
        build_url(item, item_type)
    ))
    conn.commit()

def build_url(item, item_type):
    """Return the public huggingface.co URL for *item*.

    Models, datasets, spaces, collections and organizations live directly
    under the site root; papers live under /papers/. Unknown types fall back
    to the bare site root.
    """
    base = "https://huggingface.co"
    direct_types = ("model", "dataset", "space", "collection", "organization")
    if item_type in direct_types:
        return f"{base}/{item.get('id')}"
    if item_type == "paper":
        return f"{base}/papers/{item.get('id')}"
    return base

def load_seen_ids():
    """Return the set of item ids already stored in the database."""
    rows = cur.execute("SELECT id FROM hubs").fetchall()
    return {identifier for (identifier,) in rows}


def main():
    """Poll the Hugging Face APIs forever, recording items not seen before.

    Already-known ids are loaded once at startup and kept in memory; each
    fetcher failure is isolated so one broken endpoint cannot stop the loop.
    Ctrl-C exits cleanly and the database connection is always closed.
    """
    print("Initialisation Hugging Face Hub Watcher...")
    seen_ids = load_seen_ids()
    print(f"{len(seen_ids)} éléments déjà enregistrés.")

    fetchers = [
        ("model", fetch_models),
        ("dataset", fetch_datasets),
        ("space", fetch_spaces),
        ("collection", fetch_collections),
        ("organization", fetch_organizations),
        ("paper", fetch_papers),
    ]

    try:
        while True:
            for kind, fetch in fetchers:
                try:
                    for entry in fetch():
                        entry_id = entry.get("id")
                        # Guard clause: skip anonymous or already-seen items.
                        if not entry_id or entry_id in seen_ids:
                            continue
                        print(f"[NOUVEAU {kind.upper()}] {entry_id}")
                        save_item(entry, kind)
                        seen_ids.add(entry_id)
                except Exception as exc:
                    print(f"[ERREUR] {kind}: {exc}")

            print(f"Attente {INTERVAL}s avant prochaine vérification...\n")
            time.sleep(INTERVAL)
    except KeyboardInterrupt:
        print("Arrêt manuel.")
    finally:
        conn.close()

if __name__ == "__main__":
main()
Loading