diff --git a/.example.env b/.example.env new file mode 100644 index 0000000..d4719e8 --- /dev/null +++ b/.example.env @@ -0,0 +1,8 @@ +## OpenAI API Key +OPENAI_API_KEY=YOUR_OPENAI_API_KEY_HERE + +## Define the path to the system prompt file or the system prompt itself + +SYSTEM_PROMPT_PATH=./prompts/demo-prompt.md + +# SYSTEM_PROMPT=Your system prompt goes here. \ No newline at end of file diff --git a/GnosisPages.py b/GnosisPages.py index dd1ce54..06e8eed 100644 --- a/GnosisPages.py +++ b/GnosisPages.py @@ -24,9 +24,11 @@ if "wk_button" not in st.session_state: st.session_state.wk_button = False +if "chroma_db" not in st.session_state: + st.session_state.chroma_db = ChromaDB(openai.api_key) # Build settings -chroma_db = ChromaDB(openai.api_key) +chroma_db = st.session_state.chroma_db collection = settings.build(chroma_db) # Sidebar diff --git a/README.md b/README.md index a7f524c..677e5bb 100644 --- a/README.md +++ b/README.md @@ -9,88 +9,179 @@ pinned: false --- # GnosisPages -GnosisPages is a tool that helps you to create your own knowledge base for retrieval information when interacting with a LLM. The app take advantage of the frameworks Streamlit and Langchain and uses a client-side ChromaDB. -## Features +GnosisPages is a RAG + LLM chatbot for querying private document collections. Upload PDF files, build a semantic knowledge base, and ask questions in natural language — no keyword matching required. + +**[▶ Try the live demo](https://huggingface.co/spaces/maclenn77/pdf-explainer)** · **[Watch a walkthrough](https://youtu.be/OEQTusJGHFQ)** + +--- + +## Use Case: CV Discovery for Recruiters + +Managing large volumes of CVs is difficult. Recruiters often don't know the exact technologies or skill names to search for, and keyword-based search misses semantically equivalent terms (e.g. "machine learning" vs "aprendizaje automático", or experience in a framework analogous to the one required). 
-GnosisPages offers you the following key features: +GnosisPages solves this with semantic retrieval: a recruiter can ask "Who has experience with distributed systems and has worked in startups?" and the system finds relevant profiles even when the exact phrasing doesn't match. -- **Upload PDF files**: Upload PDF files until 200MB size. PDF files should be programmatically created or processed by an OCR tool. -- **Extract and split text**: Extract the content of your PDF files and split them for a better querying. -- **Store in a client-side VectorDB**: GnosisPages uses ChromaDB for storing the content of your pdf files on vectors (ChromaDB use by default "all-MiniLM-L6-v2" for embeddings) -- **Consult the info of your knowledge base**: Ask questions to the Intelligent Assitant about the content of your knowledge base. The Langchain Agent will use ChromaDB query functions as a tool. +Candidate data is sensitive (contact details, personal history). GnosisPages keeps it private by design: documents live in a local or private vector database, and the LLM never trains on them — it only reads the retrieved context at inference time. -## Demo +The demo ships with a pre-loaded collection of synthetic CVs generated with Claude Sonnet 4.6 and vectorized with OpenAI's `text-embedding-3-small`. -[Try the GnosisPages's demo](https://huggingface.co/spaces/maclenn77/pdf-explainer)!!! 
+--- + +## Data flow + +``` +PDF documents + │ + ▼ + Text extraction (PyMuPDF) + │ + ▼ + Chunking (LangChain TextSplitter) + │ + ▼ + Embedding (text-embedding-3-small · OpenAI) + │ + ▼ + Vector storage (ChromaDB) + │ + ▼ + User query (natural language) + │ + ├─► Embed query ──► Semantic search (ChromaDB cosine similarity) + │ │ + │ Top-k chunks + │ │ + └─────────────────► Prompt construction + │ + GPT-4o-Mini (LangChain) + │ + Answer → Streamlit UI +``` -[Watch a demo here](https://youtu.be/OEQTusJGHFQ) ## Architecture -![schematic-1](https://github.com/Maclenn77/pdf-explainer/assets/1808402/36dbacfa-43f3-4530-9d31-0e9b1127f992) +image -## Prerrequisites +### Components -For using the demo, you only need an OpenAI API Key. +| Layer | Technology | Role | +|---|---|---| +| UI | Streamlit 1.58 | Web interface and file upload | +| Orchestration | LangChain 0.3 | RAG chain, prompt management | +| Vector store | ChromaDB 1.5 | Semantic storage and retrieval | +| Embeddings | `text-embedding-3-small` (OpenAI) | Document and query vectorization | +| LLM | GPT-4o-Mini (OpenAI) | Answer generation | +| PDF parsing | PyMuPDF 1.24 | Text extraction from PDF files | -If you prefer to clone the project and run on local environment, you will require: +`text-embedding-3-small` replaces ChromaDB's default (`all-MiniLM-L6-v2`) for better semantic quality, especially across mixed-language content. -- Python ( developed with v3.11) -- OpenAI API Key -- Langchain -- ChromaDB -- Streamlit -- A code editor +### Why GPT-4o-Mini -## Setup +- Fast response times for conversational QA over retrieved context +- Lower cost per token than GPT-4o or GPT-4 Turbo +- Native LangChain integration +- Stable OpenAI API with no additional infrastructure -Follow the next steps to set up GnosisPages in your local environment: +### Why RAG -1. Clone this repository +The knowledge base is private, dynamic, and cannot be baked into model weights. 
RAG provides on-demand access to documents the LLM was never trained on, without exposing them to external services beyond the query moment. -```bash - git clone https://github.com/maclenn77/pdf-explainer.git -``` +--- + +## Features + +- **Upload PDFs** up to 200 MB (programmatically created or OCR-processed) +- **Semantic search** across your document collection — finds relevant content even without exact keyword matches +- **Conversational interface** — ask follow-up questions in the same session +- **Pre-loaded dataset** — the demo includes synthetic CVs so you can try it immediately without uploading anything +- **Private by design** — documents stay in your vector store; the LLM only sees retrieved chunks + +--- + +## Demo Usage + +The live demo on HuggingFace requires only an OpenAI API Key. + +**Example questions to try with the pre-loaded CV dataset:** -3. Navigate to the project directory -```bash - cd pdf-explainer ``` -4. Create your .env file +Who has experience with Python and machine learning? +Find candidates who have worked in startups or early-stage companies. +Who has the most experience in technical leadership roles? +Is there anyone with a background in both data engineering and backend development? +Which candidates mention experience with cloud infrastructure? +``` + +--- + +## Local Setup + +**Requirements:** Python 3.11, OpenAI API Key + ```bash - touch .env - nano .env # or your prefered text editor +# 1. Clone +git clone https://github.com/maclenn77/pdf-explainer.git +cd pdf-explainer + +# 2. Create environment file +touch .env ``` - And add your OpenAI API Key. -```yaml - OPENAI_API_KEY=YOUR_OPENAI_API_KEY + +Add your key to `.env`: + ``` -5. Install dependencies. -```bash - pip install -r requirements.txt +OPENAI_API_KEY=your_key_here ``` -6. Run on your local environment + ```bash - streamlit run GnosisPages.py +# 3. Install dependencies +pip install -r requirements.txt + +# 4. 
Run +streamlit run GnosisPages.py ``` +--- + ## Deployment -GnosisPages's repo includes workflows for deploying to HuggingFace. +The repo includes three GitHub Actions workflows that run on every PR and deploy automatically on merge to `main`: -1. **Check file size**: Prevents to merge and deploy files over the limit provided by HuggingFace 🤗. -2. **Check lints**: Analize the code with pylint. -3. **Deploy to HuggingFace**: Once a branch is merged into main, the last version is deployed on a HuggingFace Space. +| Workflow | What it does | +|---|---| +| Check file size | Blocks merges with files above HuggingFace's size limit | +| Check lints | Runs `pylint` on the codebase | +| Deploy to HuggingFace | Pushes the latest `main` to the HuggingFace Space | -For deploying, you need to add `HF_TOKEN` as secret in the settings of your fork and add a HuggingFace user with the variable name `HF_USERNAME`. +To deploy your own fork, add these secrets in your repository settings: -## Feedback and Contributions -If you have any feedback or would like to contribute to GnosisPages's development, please feel free to open issues or submit pull requests in the GitHub repository. +- `HF_TOKEN` — your HuggingFace access token +- `HF_USERNAME` — your HuggingFace username -## License -This project is licensed under the MIT License. See the LICENSE file for details. +--- + +## Project Structure + +``` +pdf-explainer/ +├── GnosisPages.py # App entry point +├── gnosis/ +│ ├── chroma_client.py # ChromaDB wrapper +│ ├── settings.py # Collection bootstrap (loads pre-built DB) +│ ├── gui_messages.py # UI copy +│ └── components/ +│ ├── sidebar.py # File upload and DB controls +│ └── main.py # Chat interface and RAG chain +├── pages/ # Additional Streamlit pages +├── requirements.txt +├── Dockerfile +└── .github/workflows/ # CI/CD +``` --- -Enjoy using GnosisPages to create and consult your knowled base! 
If you have any questions or encounter issues during the setup process, please don't hesitate to reach out for assistance. +## License + +MIT — see [LICENSE](LICENSE) for details. diff --git a/gnosis/agent.py b/gnosis/agent.py index 093d82a..6839ad9 100644 --- a/gnosis/agent.py +++ b/gnosis/agent.py @@ -1,8 +1,30 @@ """An Langchain Agent that uses ChromaDB as a query tool""" +import os +from pathlib import Path + from langchain.agents import AgentType, initialize_agent, load_tools from langchain.tools import Tool +from langchain.schema import SystemMessage from gnosis.search import Search +def load_system_prompt() -> SystemMessage: + """Load system prompt from a markdown file defined in SYSTEM_PROMPT_PATH, + or fall back to SYSTEM_PROMPT env var, or use a default.""" + prompt_path = os.getenv("SYSTEM_PROMPT_PATH") + if prompt_path: + path = Path(prompt_path) + if path.exists(): + return SystemMessage(content=path.read_text(encoding="utf-8")) + + fallback = os.getenv("SYSTEM_PROMPT") + if fallback: + return SystemMessage(content=fallback) + + return SystemMessage(content=( + "You are a helpful assistant with access to a private document database. " + "Only use information retrieved from the database or Wikipedia if tool is enabled. " + "If the answer is not there, say so clearly." 
+ )) class PDFExplainer: """An Agent that uses ChromaDB as a query tool""" @@ -17,6 +39,7 @@ def __init__(self, llm, chroma_db, extra_tools=False): name="Search_on_ChromaDB", description="Useful when you need more context for answering a question.", handle_parsing_errors=True, + agent_kwargs={"system_message": load_system_prompt()}, ) ] @@ -26,8 +49,7 @@ def __init__(self, llm, chroma_db, extra_tools=False): self.agent = initialize_agent( self.tools, llm, - #agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, - agent=AgentType.OPENAI_FUNCTIONS, # ← uses OpenAI's native function calling + agent=AgentType.OPENAI_FUNCTIONS, verbose=False, handle_parsing_errors=True, ) diff --git a/gnosis/search.py b/gnosis/search.py index 5eb10b0..8a1f344 100644 --- a/gnosis/search.py +++ b/gnosis/search.py @@ -1,6 +1,5 @@ """Search Tool""" - class Search: """Search Tool""" @@ -10,7 +9,18 @@ def __init__(self, chroma_db): def run(self, query: str): """Run the Agent""" + if not self.chroma_db.api_key: + return "No API key set. Please add your OpenAI API key in the sidebar." + collection = self.chroma_db.get_collection("pdf-explainer") + + # get_collection returns a Streamlit object if something went wrong + if not hasattr(collection, "query"): + return "Could not access the collection. Please check your API key." + + if collection.count() == 0: + return "No documents found. Please upload a PDF first." + results = collection.query(query_texts=[query], n_results=5)["documents"][0] return "\n\n---\n\n".join(results) diff --git a/prompts/demo-prompt.md b/prompts/demo-prompt.md new file mode 100644 index 0000000..59245b3 --- /dev/null +++ b/prompts/demo-prompt.md @@ -0,0 +1,10 @@ +You are a recruitment assistant specialized in analyzing CVs and candidate profiles. +You have access to a private database of CVs. Use it to answer questions about candidates. + +Guidelines: +- Only use information retrieved from the database. Never invent candidates or experiences. 
+- If the information is not in the database, say so clearly. +- When listing candidates, include their name, relevant skills, and years of experience if available. +- You can compare candidates when asked. +- When you are asked about topics outside your expertise, or about matters where you lack current information (such as labor laws or industry data), state when your knowledge was last updated and that you do not have access to more recent data. +- Respond in the same language the user writes in.