"""Ingest hadith and article spreadsheets into a Qdrant hybrid-search collection.

Importing this module locates the project root (the nearest ancestor directory
containing ``src``), puts it on ``sys.path``, loads ``.env`` and builds the
module-level ``vector_db`` / ``knowledge_base`` objects.  Running it as a script
ingests ``data/raw/hadiths_data.xlsx`` and ``data/raw/dovodi_articles.xlsx``.
"""
import os
import sys
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from agno.knowledge.knowledge import Knowledge
from agno.vectordb.qdrant import Qdrant
from agno.vectordb.search import SearchType

# -----------------------------------------------------------------------------
# DYNAMIC PATH SETUP
# Walk up from this file until a directory containing 'src' is found, so the
# script works no matter which folder it is launched from.
# -----------------------------------------------------------------------------
current_file = Path(__file__).resolve()

root_path = current_file.parent
while not (root_path / 'src').exists():
    if root_path == root_path.parent:  # Reached filesystem root
        raise FileNotFoundError("Could not find project root containing 'src' folder")
    root_path = root_path.parent

# Add the project root to the Python path so `src.*` imports resolve.
sys.path.insert(0, str(root_path))
print(f"🔧 Added project root to path: {root_path}")
# -----------------------------------------------------------------------------

# Must come after the sys.path tweak above.
from src.knowledge.embedding_factory import EmbeddingFactory  # noqa: E402

load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty.  Without this check a
            missing value would be formatted as the text "None" into the
            Qdrant URL or the collection name.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Required environment variable {name} is not set")
    return value


# --- 1. CONFIGURATION ---
qdrant_host = _require_env("QDRANT_HOST")
qdrant_port = _require_env("QDRANT_PORT")
qdrant_url = f"http://{qdrant_host}:{qdrant_port}"
collection_name = _require_env("BASE_COLLECTION_NAME")
qdrant_api_key = os.getenv("QDRANT_API_KEY")  # optional; None is passed through

# Matches the embedder used in app.py
embed_factory = EmbeddingFactory()
local_embedder = embed_factory.get_embedder()

# The collection name is suffixed with the embedder id and "_hybrid".
collection_name = f"{collection_name}_{local_embedder.id}_hybrid"
print("****************************************************************")
print(f"Collection name: {collection_name}")

# Initialize Qdrant Vector DB
vector_db = Qdrant(
    collection=collection_name,
    url=qdrant_url,
    embedder=local_embedder,
    timeout=30.0,
    api_key=qdrant_api_key,
    search_type=SearchType.hybrid,
)

knowledge_base = Knowledge(vector_db=vector_db)

# (output label, spreadsheet column) pairs, in the order they are emitted.
_HADITH_FIELDS = (
    ("TITLE", "Title"),
    ("ARABIC", "Arabic Text"),
    ("TRANSLATION", "Translation"),
    ("SOURCE", "Source Info"),
)
_ARTICLE_FIELDS = (
    ("TITLE", "Title"),
    ("AUTHOR", "Author"),
    ("CONTENT", "Content"),
    ("URL", "URL"),
)


def _cell(row: pd.Series, column: str) -> object:
    """Return ``row[column]``, or ``""`` when the column is absent or the cell is empty.

    ``row.get(column, "")`` alone only covers an absent column; an empty cell is
    returned as NaN/None and would otherwise be rendered as the text "nan".
    """
    value = row.get(column, "")
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return value


def _ingest_rows(file_path: str, header: str, fields: tuple) -> int:
    """Add one text document per spreadsheet row to ``knowledge_base``.

    Each document is *header* followed by one ``LABEL: value`` line per entry
    in *fields*.  Returns the number of rows ingested.
    """
    df = pd.read_excel(file_path)
    count = 0
    for _, row in df.iterrows():
        lines = [header]
        lines.extend(f"{label}: {_cell(row, column)}" for label, column in fields)
        knowledge_base.add_content(text_content="\n".join(lines))
        count += 1
    return count


def ingest_hadiths(file_path: str):
    """Read the hadith workbook at *file_path* and add every row to the knowledge base."""
    print(f"📖 Processing Hadiths: {file_path}")
    count = _ingest_rows(file_path, "HADITH TYPE: HADITH", _HADITH_FIELDS)
    print(f"✅ Successfully ingested {count} Hadiths into Qdrant.")


def ingest_articles(file_path: str):
    """Read the article workbook at *file_path* and add every row to the knowledge base."""
    print(f"📄 Processing Articles: {file_path}")
    count = _ingest_rows(file_path, "ARTICLE TYPE: ARTICLE", _ARTICLE_FIELDS)
    print(f"✅ Successfully ingested {count} Articles into Qdrant.")


if __name__ == "__main__":
    print("--- 🚀 Starting Data Ingestion to Qdrant ---")

    # 1. Directory holding this script
    SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))

    # 2. Go up one level to the Project Root
    PROJECT_ROOT = os.path.dirname(SCRIPTS_DIR)

    # 3. Build the path to the data folder
    DATA_DIR = os.path.join(PROJECT_ROOT, "data", "raw")

    # 4. Define your file paths
    HADITH_FILE = os.path.join(DATA_DIR, "hadiths_data.xlsx")
    ARTICLE_FILE = os.path.join(DATA_DIR, "dovodi_articles.xlsx")

    try:
        # A missing file is reported and skipped; it is not an error.
        for data_file, ingest in (
            (HADITH_FILE, ingest_hadiths),
            (ARTICLE_FILE, ingest_articles),
        ):
            if os.path.exists(data_file):
                ingest(data_file)
            else:
                print(f"⚠️ {data_file} not found!")

        print("--- ✨ Ingestion Complete ---")

    except Exception as e:
        print(f"❌ Error during ingestion: {e}")
        # Exit non-zero so shells / CI can detect the failed run.
        sys.exit(1)