| import chromadb |
| from sentence_transformers import SentenceTransformer |
| import os |
|
|
| |
# --- Configuration ---------------------------------------------------------
# Identifier of the sentence-transformers model used to compute embeddings.
MODEL_NAME = "all-MiniLM-L6-v2"
# Name of the ChromaDB collection that holds the embedded documents.
COLLECTION_NAME = "aura_mind_knowledge"
# Directory scanned for ``.txt`` source documents (relative to the CWD).
KNOWLEDGE_BASE_DIR = "knowledge_base_data"

# --- Shared handles, created once at import time ---------------------------
# On-disk Chroma client; data is persisted under ./chroma_db.
client = chromadb.PersistentClient(path="chroma_db")
# Embedding model shared by the indexing and search functions.
model = SentenceTransformer(MODEL_NAME)
# Fetch the collection if it already exists, otherwise create it.
collection = client.get_or_create_collection(name=COLLECTION_NAME)
|
|
def embed_and_store_documents():
    """
    Embed every ``.txt`` file in the knowledge base directory and store it
    in the ChromaDB collection.

    The function does nothing if the collection already contains at least
    one entry. Otherwise each ``.txt`` file directly inside
    ``KNOWLEDGE_BASE_DIR`` is read as UTF-8 text, embedded with the shared
    model, and added to the collection using the file name as its id.

    Raises:
        FileNotFoundError: If ``KNOWLEDGE_BASE_DIR`` does not exist
            (raised by ``os.listdir``).
        UnicodeDecodeError: If a document is not valid UTF-8.
    """
    if collection.count() > 0:
        print("✅ Knowledge base is already loaded into ChromaDB.")
        return

    print("Embedding and storing documents in ChromaDB...")
    documents = []
    ids = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # ingestion order identical across runs and platforms.
    for filename in sorted(os.listdir(KNOWLEDGE_BASE_DIR)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(KNOWLEDGE_BASE_DIR, filename)
        # Explicit encoding: the platform default (e.g. cp1252 on Windows)
        # would otherwise mis-decode or reject non-ASCII documents.
        with open(path, "r", encoding="utf-8") as f:
            documents.append(f.read())
        ids.append(filename)

    if not documents:
        # Tell the operator why nothing happened instead of returning silently.
        print(f"⚠️ No .txt documents found in '{KNOWLEDGE_BASE_DIR}'; nothing was stored.")
        return

    embeddings = model.encode(documents).tolist()
    collection.add(
        embeddings=embeddings,
        documents=documents,
        ids=ids,
    )
    print(f"✅ Successfully stored {len(documents)} documents in ChromaDB.")
|
|
def search_documents(query: str, n_results: int = 1) -> list:
    """
    Searches for relevant documents in ChromaDB based on a query.

    Args:
        query: The search query. An empty, ``None`` or whitespace-only
            query yields no results.
        n_results: The maximum number of results to return. It is capped
            at the number of entries currently in the collection.

    Returns:
        A list of the matching document texts for the query, or an empty
        list when the query is blank or the collection is empty.
    """
    # A blank query carries no meaning worth embedding.
    if not query or not query.strip():
        return []

    available = collection.count()
    if available == 0:
        # Nothing has been indexed yet, so there is nothing to query.
        return []

    query_embedding = model.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        # Depending on the Chroma version, asking for more results than the
        # collection holds raises or logs a warning; cap the request instead.
        n_results=min(n_results, available),
    )
    # One query was sent, so the first inner list holds its documents.
    return results['documents'][0] if results['documents'] else []
|
|