You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

127 lines
4.1 KiB

import os
import pandas as pd
from dotenv import load_dotenv
from agno.knowledge.knowledge import Knowledge
from agno.vectordb.qdrant import Qdrant
from agno.vectordb.search import SearchType
import sys
from pathlib import Path
# -----------------------------------------------------------------------------
# DYNAMIC PATH SETUP
# Locate the project root (the nearest ancestor directory that contains a
# 'src' folder) and put it on sys.path, so `src.*` imports resolve no matter
# which directory this script is launched from.
# -----------------------------------------------------------------------------
# Absolute, symlink-resolved location of this script.
current_file = Path(__file__).resolve()

# `parents` yields the containing directory first and the filesystem root
# last, so exhausting it means no ancestor holds a 'src' folder.
for root_path in current_file.parents:
    if (root_path / "src").exists():
        break
else:
    raise FileNotFoundError("Could not find project root containing 'src' folder")

# Give the project root priority over anything else on the import path.
sys.path.insert(0, str(root_path))
print(f"🔧 Added project root to path: {root_path}")
# -----------------------------------------------------------------------------
from src.knowledge.embedding_factory import EmbeddingFactory
# Pull settings from a local .env file into the process environment.
load_dotenv()

# --- 1. CONFIGURATION ---
qdrant_host = os.getenv("QDRANT_HOST")
qdrant_port = os.getenv("QDRANT_PORT")
collection_name = os.getenv("BASE_COLLECTION_NAME")
qdrant_api_key = os.getenv("QDRANT_API_KEY")  # forwarded as-is, even when unset

# Fail fast when the connection settings are absent: interpolating None would
# otherwise yield the unusable URL "http://None:None".
_missing_env = [
    name
    for name, value in (("QDRANT_HOST", qdrant_host), ("QDRANT_PORT", qdrant_port))
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

qdrant_url = f"http://{qdrant_host}:{qdrant_port}"

# Matches the embedder used in app.py
embed_factory = EmbeddingFactory()
local_embedder = embed_factory.get_embedder()

# The final collection name is derived from the base name and the embedder id.
# NOTE(review): an unset BASE_COLLECTION_NAME yields "None_<id>_hybrid"; left
# unchanged here in case another component derives the same name — confirm.
collection_name = f"{collection_name}_{local_embedder.id}_hybrid"
print("****************************************************************")
print(f"Collection name: {collection_name}")

# Initialize Qdrant Vector DB
vector_db = Qdrant(
    collection=collection_name,
    url=qdrant_url,
    embedder=local_embedder,
    timeout=30.0,
    api_key=qdrant_api_key,
    search_type=SearchType.hybrid,
)

# Knowledge wrapper that the ingest functions write through.
knowledge_base = Knowledge(vector_db=vector_db)
def ingest_hadiths(file_path: str) -> None:
    """Ingest every row of a Hadith Excel workbook into the knowledge base.

    The workbook is read with pandas. For each row, one text document is
    built from the 'Title', 'Arabic Text', 'Translation' and 'Source Info'
    columns and handed to ``knowledge_base.add_content``. Progress and the
    final row count are printed.

    Args:
        file_path: Path of the Excel file to read.
    """
    print(f"📖 Processing Hadiths: {file_path}")
    df = pd.read_excel(file_path)

    def cell(row, column):
        """Return the row's value for *column*, or '' if absent or missing."""
        value = row.get(column, "")
        # Empty Excel cells are read as NaN/NaT; without this check they
        # would be written into the document as the literal text "nan".
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return value

    count = 0
    for _, row in df.iterrows():
        content = (
            "HADITH TYPE: HADITH\n"
            f"TITLE: {cell(row, 'Title')}\n"
            f"ARABIC: {cell(row, 'Arabic Text')}\n"
            f"TRANSLATION: {cell(row, 'Translation')}\n"
            f"SOURCE: {cell(row, 'Source Info')}"
        )
        knowledge_base.add_content(text_content=content)
        count += 1
    print(f"✅ Successfully ingested {count} Hadiths into Qdrant.")
def ingest_articles(file_path: str) -> None:
    """Ingest every row of an article Excel workbook into the knowledge base.

    The workbook is read with pandas. For each row, one text document is
    built from the 'Title', 'Author', 'Content' and 'URL' columns and handed
    to ``knowledge_base.add_content``. Progress and the final row count are
    printed.

    Args:
        file_path: Path of the Excel file to read.
    """
    print(f"📄 Processing Articles: {file_path}")
    df = pd.read_excel(file_path)

    def cell(row, column):
        """Return the row's value for *column*, or '' if absent or missing."""
        value = row.get(column, "")
        # Empty Excel cells are read as NaN/NaT; without this check they
        # would be written into the document as the literal text "nan".
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return value

    count = 0
    for _, row in df.iterrows():
        content = (
            "ARTICLE TYPE: ARTICLE\n"
            f"TITLE: {cell(row, 'Title')}\n"
            f"AUTHOR: {cell(row, 'Author')}\n"
            f"CONTENT: {cell(row, 'Content')}\n"
            f"URL: {cell(row, 'URL')}"
        )
        knowledge_base.add_content(text_content=content)
        count += 1
    print(f"✅ Successfully ingested {count} Articles into Qdrant.")
if __name__ == "__main__":
    # Script entry point: ingest the Hadith and article workbooks found under
    # <project root>/data/raw, warning about (and skipping) any missing file.
    print("--- 🚀 Starting Data Ingestion to Qdrant ---")

    # 1. Directory that contains this script
    SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
    # 2. Go up one level to the Project Root
    PROJECT_ROOT = os.path.dirname(SCRIPTS_DIR)
    # 3. Build the path to the data folder
    DATA_DIR = os.path.join(PROJECT_ROOT, "data", "raw")
    # 4. Define your file paths
    HADITH_FILE = os.path.join(DATA_DIR, "hadiths_data.xlsx")
    ARTICLE_FILE = os.path.join(DATA_DIR, "dovodi_articles.xlsx")

    try:
        # Ingest Hadiths
        if os.path.exists(HADITH_FILE):
            ingest_hadiths(HADITH_FILE)
        else:
            print(f"⚠️ {HADITH_FILE} not found!")

        # Ingest Articles
        if os.path.exists(ARTICLE_FILE):
            ingest_articles(ARTICLE_FILE)
        else:
            print(f"⚠️ {ARTICLE_FILE} not found!")

        print("--- ✨ Ingestion Complete ---")
    except Exception as e:
        # Top-level boundary: report the failure, then exit non-zero so shells,
        # schedulers and CI do not mistake a failed ingestion for success.
        print(f"❌ Error during ingestion: {e}")
        sys.exit(1)