OpenLake · kushal281 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026 · coderabbitai
diff --git a/.env.example b/.env.example
@@ -0,0 +1,11 @@
+GITHUB_WEBHOOK_SECRET=
+GITHUB_TOKEN=
+REPO_NAME= #Enter repo from where you want to fetch issues
+SIMILARITY_THRESHOLD=
+MY_REPO_NAME= #Enter repo for which you want to use 
+
+GITHUB_TOKEN=your_token_here
+GITHUB_WEBHOOK_SECRET=your_secret_here
+REPO_NAME=OpenLake/your-repo
+TARGET_REPO=your-test-repo
+SIMILARITY_THRESHOLD=0.85
-GITHUB_WEBHOOK_SECRET=
-GITHUB_TOKEN=
-REPO_NAME= #Enter repo from where you want to fetch issues
-SIMILARITY_THRESHOLD=
-MY_REPO_NAME= #Enter repo for which you want to use 
-
-GITHUB_TOKEN=your_token_here
-GITHUB_WEBHOOK_SECRET=your_secret_here
-REPO_NAME=OpenLake/your-repo
-TARGET_REPO=your-test-repo
-SIMILARITY_THRESHOLD=0.85
+GITHUB_TOKEN=your_token_here
+GITHUB_WEBHOOK_SECRET=your_secret_here
+SOURCE_REPO=OpenLake/your-repo
+TARGET_REPO=your-test-repo
+SIMILARITY_THRESHOLD=0.85
-GITHUB_WEBHOOK_SECRET=
-GITHUB_TOKEN=
-REPO_NAME= #Enter repo from where you want to fetch issues
-SIMILARITY_THRESHOLD=
-MY_REPO_NAME= #Enter repo for which you want to use 
-
-GITHUB_TOKEN=your_token_here
-GITHUB_WEBHOOK_SECRET=your_secret_here
-REPO_NAME=OpenLake/your-repo
-TARGET_REPO=your-test-repo
-SIMILARITY_THRESHOLD=0.85
+GITHUB_TOKEN=your_token_here
+GITHUB_WEBHOOK_SECRET=your_secret_here
+SOURCE_REPO=OpenLake/your-repo
+TARGET_REPO=your-test-repo
+SIMILARITY_THRESHOLD=0.85
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,19 @@
+# Python
+__pycache__/
+*.pyc
+
+# Virtual environment
+venv/
+.env
+
+# Generated data (issue dump and Chroma database)
+data/issues.json
+data/chroma/chroma.sqlite3
+data/chroma/defcbcc9-4578-4553-b3f4-648cbe1c763b
+
+# IDE
+.vscode/
+.idea/
+
+# OS
+.DS_Store
diff --git a/README.md b/README.md
@@ -1,3 +1,3 @@
 # SmartTriage
 
 An intelligent GitHub bot that automates issue triage and PR reviewer assignment using vector similarity search and commit history analysis for OpenLake repositories.
@@ -258,3 +258,3 @@ Refer to `app/api/webhooks.py` for the full webhook routing logic.
 ↥ [Back to top](#table-of-contents)
 
 If you have any questions or feedback, feel free to reach out to the maintainers or open an issue in the repository.
diff --git a/app/api/webhooks.py b/app/api/webhooks.py
@@ -0,0 +1,51 @@
+from fastapi import APIRouter, Request
+
+from app.ml.duplicate import detect_duplicate
+from app.core.github import GitHubClient
+
+# Router exposing the webhook endpoint; app/main.py mounts it on the app.
+router = APIRouter()
+
+# Shared GitHub client used to post the duplicate-warning comment.
+github = GitHubClient()
+
+
+@router.post("/webhook")
+async def github_webhook(request: Request):
+    """Handle a GitHub webhook delivery.
+
+    When ``GITHUB_WEBHOOK_SECRET`` is configured, the delivery must carry a
+    valid ``X-Hub-Signature-256`` header (HMAC-SHA256 of the raw body keyed
+    with the secret); otherwise the request is rejected with HTTP 401.
+    Without a configured secret, verification is skipped.
+
+    For an ``issues`` event with action ``opened``, the issue title and body
+    are checked against the indexed issues and, if a likely duplicate is
+    found, a comment linking the similar issue is posted on the new issue.
+    Every other event (or an unparsable body) is acknowledged and ignored.
+
+    Returns:
+        ``{"ok": True}`` for every accepted delivery.
+
+    Raises:
+        HTTPException: 401 when a secret is configured and the signature
+            header is missing or does not match.
+    """
+    # Function-scope imports keep the new dependencies local to this handler.
+    import hashlib
+    import hmac
+
+    from fastapi import HTTPException
+
+    from app.config import WEBHOOK_SECRET
+
+    event = request.headers.get("X-GitHub-Event")
+
+    print(f"GitHub Event: {event}")
+
+    # The raw bytes are needed for the signature; request.json() below
+    # reuses the same cached body.
+    raw_body = await request.body()
+
+    if WEBHOOK_SECRET:
+        expected = "sha256=" + hmac.new(
+            WEBHOOK_SECRET.encode("utf-8"), raw_body, hashlib.sha256
+        ).hexdigest()
+        received = request.headers.get("X-Hub-Signature-256", "")
+        # Compare as bytes: compare_digest rejects non-ASCII str input.
+        if not hmac.compare_digest(
+            expected.encode("utf-8"), received.encode("utf-8")
+        ):
+            raise HTTPException(status_code=401, detail="Invalid signature")
+
+    try:
+        payload = await request.json()
+    except ValueError:
+        # Malformed JSON / undecodable body: treat as an empty delivery.
+        payload = {}
+
+    # A JSON array or scalar has no .get(); treat it like an empty payload.
+    if not isinstance(payload, dict):
+        payload = {}
+
+    if event == "issues" and payload.get("action") == "opened":
+        issue = payload.get("issue") or {}
+        issue_number = issue.get("number")
+
+        # Without an issue number there is nothing to comment on.
+        if issue_number is not None:
+            issue_text = (
+                (issue.get("title") or "")
+                + "\n"
+                + (issue.get("body") or "")
+            )
+
+            # NOTE(review): detect_duplicate and comment_issue are plain
+            # synchronous calls inside an async handler — confirm whether
+            # they should be moved off the event loop.
+            result = detect_duplicate(issue_text)
+            print("Duplicate result:", result)
+
+            if result and result["duplicate"]:
+                github.comment_issue(
+                    issue_number,
+
+                    f"""
+Possible duplicate issue detected.
+
+Similar issue:
+{result['issue']['url']}
+
+Similarity:
+{result['similarity']:.2f}
+"""
+                )
+
+    return {"ok": True}
diff --git a/app/config.py b/app/config.py
@@ -0,0 +1,10 @@
+from dotenv import load_dotenv
+import os
+
+# Pull variables from a local .env file into the process environment.
+load_dotenv()
+
+# Each setting is read from the environment variable of the same name
+# (the webhook secret from GITHUB_WEBHOOK_SECRET); unset values are None.
+WEBHOOK_SECRET = os.getenv("GITHUB_WEBHOOK_SECRET")
+GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
+TARGET_REPO = os.getenv("TARGET_REPO")
+# `or` instead of a getenv default: a blank "SIMILARITY_THRESHOLD=" line
+# yields "" (not None), and float("") would raise ValueError at import time.
+SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD") or 0.85)
+SOURCE_REPO = os.getenv("SOURCE_REPO")
diff --git a/app/core/github.py b/app/core/github.py
@@ -0,0 +1,42 @@
+from github import Github
+from app.config import GITHUB_TOKEN
+from app.config import TARGET_REPO
+
+class GitHubClient:
+    """Small wrapper around the PyGithub client authenticated with GITHUB_TOKEN."""
+
+    def __init__(self):
+        self.client = Github(GITHUB_TOKEN)
+
+    def get_repo(self, repo_name):
+        """Return the repository object for ``repo_name``."""
+        return self.client.get_repo(repo_name)
+
+    def fetch_issues(self, repo_name):
+        """Return every issue of ``repo_name`` (any state) as a list of dicts.
+
+        Entries whose ``pull_request`` attribute is truthy are left out.
+        Each dict carries ``id``, ``title``, ``body``, ``url``,
+        ``created_at`` (stringified) and ``state``.
+        """
+        every_entry = self.get_repo(repo_name).get_issues(state="all")
+
+        return [
+            {
+                "id": entry.id,
+                "title": entry.title,
+                "body": entry.body,
+                "url": entry.html_url,
+                "created_at": str(entry.created_at),
+                "state": entry.state,
+            }
+            for entry in every_entry
+            if not entry.pull_request  # skip pull requests
+        ]
+
+    def comment_issue(self, issue_number, comment):
+        """Post ``comment`` on issue ``issue_number`` of TARGET_REPO.
+
+        The looked-up repository is also kept on ``self.repo``.
+        """
+        self.repo = self.get_repo(TARGET_REPO)
+        self.repo.get_issue(number=issue_number).create_comment(comment)
diff --git a/app/db/ingest.py b/app/db/ingest.py
@@ -0,0 +1,42 @@
+from app.core.github import GitHubClient
+from app.ml.clean import clean_issue
+from app.db.vector import add_issue
+from app.config import SOURCE_REPO
+import json
+import os
+
+def main():
+    """Index every issue of SOURCE_REPO and dump the raw records to disk.
+
+    Each fetched issue is cleaned/embedded and upserted into the vector
+    store; afterwards the untouched id/title/body/state/url fields of all
+    issues are written to ``data/issues.json``.
+    """
+    client = GitHubClient()
+    issues = client.fetch_issues(SOURCE_REPO)
+
+    for raw in issues:
+        prepared = clean_issue(raw)
+        extra = {
+            "state": prepared["state"],
+            "url": prepared["url"]
+        }
+        add_issue(prepared["id"], prepared["text"], prepared["vector"], extra)
+
+    # Raw (uncleaned) copies, restricted to the fields worth keeping on disk.
+    kept_fields = ("id", "title", "body", "state", "url")
+    uncleaned_issues = [
+        {name: raw[name] for name in kept_fields}
+        for raw in issues
+    ]
+
+    os.makedirs("data", exist_ok=True)
+
+    with open("data/issues.json", "w", encoding="utf-8") as dump_file:
+        json.dump(uncleaned_issues, dump_file, indent=4, ensure_ascii=False)
+    print(f"Stored {len(issues)} issues")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/app/db/vector.py b/app/db/vector.py
@@ -0,0 +1,24 @@
+import chromadb
+from chromadb.config import Settings
+
+client = chromadb.PersistentClient(path="./data/chroma")  # Chroma client persisting under ./data/chroma (a relative path)
+
+
+collection = client.get_or_create_collection(name="github_issues")
-collection = client.get_or_create_collection(name="github_issues")
+collection = client.get_or_create_collection(
+    name="github_issues",
+    metadata={"hnsw:space": "cosine"},
+)
-collection = client.get_or_create_collection(name="github_issues")
+collection = client.get_or_create_collection(
+    name="github_issues",
+    metadata={"hnsw:space": "cosine"},  # cosine space: app/ml/duplicate.py derives similarity as 1 - distance
+)
+
+
+def add_issue(issue_id, text, vector, metadata):
+    """Upsert a single issue into the ``collection``.
+
+    The id is stored as a string; ``text`` is the document, ``vector`` its
+    embedding and ``metadata`` the attached metadata dict.
+    """
+    record = {
+        "ids": [str(issue_id)],
+        "documents": [text],
+        "embeddings": [vector],
+        "metadatas": [metadata],
+    }
+    collection.upsert(**record)
+
+
+def search_similar_issue(embedding, limit=1):
+    """Query the ``collection`` with ``embedding`` and return the raw result.
+
+    ``limit`` is passed through as the number of results requested.
+    """
+    return collection.query(query_embeddings=[embedding], n_results=limit)
diff --git a/app/main.py b/app/main.py
@@ -0,0 +1,10 @@
+from fastapi import FastAPI
+from app.api.webhooks import router
+
+# Application object; the webhook router is mounted right after creation.
+app = FastAPI(title="SmartTriage")
+app.include_router(router)
+
+
+@app.get("/health")
+def health():
+    """Health endpoint: always answers with a fixed ``status: ok`` body."""
+    return dict(status="ok")
diff --git a/app/ml/clean.py b/app/ml/clean.py
@@ -0,0 +1,22 @@
+from app.ml.embedder import Embedder
+
+# Module-level embedder shared by every clean_issue call.
+embedder = Embedder()
+
+
+def clean_issue(issue):
+    """Build the indexable form of a fetched issue dict.
+
+    The text is the title (if any) followed by a newline and the body (if
+    any). The embedding is computed from that text as built; the returned
+    ``text`` is the stripped version. ``id``, ``url`` and ``state`` are
+    copied through unchanged.
+    """
+    title_part = issue["title"] or ""
+    body = issue["body"]
+    body_part = "\n" + body if body else ""
+    text = title_part + body_part
+
+    return {
+        "vector": embedder.generate_embedding(text),
+        "id": issue["id"],
+        "text": text.strip(),
+        "url": issue["url"],
+        "state": issue["state"],
+    }
diff --git a/app/ml/duplicate.py b/app/ml/duplicate.py
@@ -0,0 +1,30 @@
+from app.ml.embedder import Embedder
+from app.db.vector import search_similar_issue
+from app.config import SIMILARITY_THRESHOLD
+
+# Module-level embedder reused for every incoming issue.
+embedder = Embedder()
+
+
+def detect_duplicate(issue_text):
+    """Compare ``issue_text`` with the closest indexed issue.
+
+    Returns:
+        ``None`` when the search yields no distances; otherwise a dict with
+        ``duplicate`` and ``similarity`` (``1 - distance``, floored at 0).
+        When the similarity reaches SIMILARITY_THRESHOLD the dict also holds
+        the matched issue's metadata under ``issue``.
+    """
+    query_vector = embedder.generate_embedding(issue_text)
+    hits = search_similar_issue(query_vector)
+
+    nearest_distances = hits["distances"][0]
+    if not nearest_distances:
+        return None
+
+    similarity = max(0, 1 - nearest_distances[0])
+    is_duplicate = similarity >= SIMILARITY_THRESHOLD
+
+    if not is_duplicate:
+        return {
+            "duplicate": False,
+            "similarity": similarity
+        }
+
+    return {
+        "duplicate": True,
+        "similarity": similarity,
+        "issue": hits["metadatas"][0][0]
+    }
diff --git a/app/ml/embedder.py b/app/ml/embedder.py
@@ -0,0 +1,9 @@
+from sentence_transformers import SentenceTransformer
+
+class Embedder:
+    """Turn text into embedding vectors with a SentenceTransformer model."""
+
+    def __init__(self, model_name="all-MiniLM-L6-v2"):
+        """Load the sentence-transformers model called ``model_name``.
+
+        The default keeps the model this project has always used.
+        NOTE(review): vectors from a different model are presumably not
+        comparable with ones already stored — confirm before switching.
+        """
+        self.model = SentenceTransformer(model_name)
+
+    def generate_embedding(self, text):
+        """Encode ``text`` and return the embedding as a plain list."""
+        return self.model.encode(text).tolist()
diff --git a/app/test.py b/app/test.py
@@ -0,0 +1,19 @@
+from app.ml.embedder import Embedder
+from app.db.vector import search_similar
+
+
+embedder = Embedder()
+
+
+text = "Cannot login into application"
+
+
+embedding = embedder.generate_embedding(text)
+
+
+result = search_similar(
+    embedding
+)
-from app.db.vector import search_similar
-
-
-embedder = Embedder()
-
-
-text = "Cannot login into application"
-
-
-embedding = embedder.generate_embedding(text)
-
-
-result = search_similar(
-    embedding
-)
+from app.db.vector import search_similar_issue
+
+
+embedder = Embedder()
+
+
+text = "Cannot login into application"
+
+
+embedding = embedder.generate_embedding(text)
+
+
+result = search_similar_issue(
+    embedding
+)
-from app.db.vector import search_similar
-
-
-embedder = Embedder()
-
-
-text = "Cannot login into application"
-
-
-embedding = embedder.generate_embedding(text)
-
-
-result = search_similar(
-    embedding
-)
+from app.db.vector import search_similar_issue
+
+
+# Manual smoke check: embed a sample sentence, query the vector store with
+# it, and print the raw search result.
+embedder = Embedder()
+text = "Cannot login into application"
+embedding = embedder.generate_embedding(text)
+result = search_similar_issue(embedding)
+
+print(result)
diff --git a/requirements.txt b/requirements.txt