You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
127 lines
4.1 KiB
127 lines
4.1 KiB
import os
|
|
import pandas as pd
|
|
from dotenv import load_dotenv
|
|
from agno.knowledge.knowledge import Knowledge
|
|
from agno.vectordb.qdrant import Qdrant
|
|
from agno.vectordb.search import SearchType
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# -----------------------------------------------------------------------------
# DYNAMIC PATH SETUP
# Locate the project root -- the nearest ancestor directory of this file that
# contains a 'src' folder -- and put it first on sys.path so that `src.*`
# imports resolve.
# -----------------------------------------------------------------------------
# Absolute, symlink-resolved location of this file.
current_file = Path(__file__).resolve()

# Walk upward, starting at this file's directory and ending at the filesystem
# root, stopping at the first directory that holds 'src'.
for root_path in current_file.parents:
    if (root_path / 'src').exists():
        break
else:
    # Every ancestor up to the filesystem root was checked without a match.
    raise FileNotFoundError("Could not find project root containing 'src' folder")

# Add the project root to the Python path.
sys.path.insert(0, str(root_path))
print(f"🔧 Added project root to path: {root_path}")
# -----------------------------------------------------------------------------
|
|
|
|
from src.knowledge.embedding_factory import EmbeddingFactory
|
|
# Load variables from a .env file into the process environment before reading them.
load_dotenv()

# --- 1. CONFIGURATION ---
# NOTE(review): os.getenv returns None for an unset variable, and the f-strings
# below interpolate that as the text "None" (e.g. "http://None:None") --
# confirm that the environment always defines these settings.
qdrant_host = os.getenv("QDRANT_HOST")
qdrant_port = os.getenv("QDRANT_PORT")
qdrant_api_key = os.getenv("QDRANT_API_KEY")
qdrant_url = f"http://{qdrant_host}:{qdrant_port}"
collection_name = os.getenv("BASE_COLLECTION_NAME")

# Embedder selection is delegated to the project's factory; per the original
# note it is meant to match the embedder used in app.py.
embed_factory = EmbeddingFactory()
local_embedder = embed_factory.get_embedder()

# The final collection name combines the base name, the embedder id and a
# fixed "_hybrid" suffix.
collection_name = f"{collection_name}_{local_embedder.id}_hybrid"

print("****************************************************************")
print(f"Collection name: {collection_name}")

# Initialize the Qdrant vector DB in hybrid search mode.
vector_db = Qdrant(
    collection=collection_name,
    url=qdrant_url,
    api_key=qdrant_api_key,
    embedder=local_embedder,
    search_type=SearchType.hybrid,
    timeout=30.0,
)

# Knowledge wrapper through which the ingestion functions add content.
knowledge_base = Knowledge(vector_db=vector_db)
|
|
|
|
|
|
def ingest_hadiths(file_path: str) -> None:
    """Read hadith rows from an Excel workbook and add each one to the knowledge base.

    Every row is rendered as one text document (type marker, TITLE, ARABIC,
    TRANSLATION and SOURCE lines) and handed to ``knowledge_base.add_content``.
    Progress and the final row count are printed to stdout.

    Args:
        file_path: Path of the workbook, read with ``pandas.read_excel``.
            The columns ``Title``, ``Arabic Text``, ``Translation`` and
            ``Source Info`` are used; a column that is absent or a cell that
            is blank is rendered as an empty value.
    """

    def cell(row: pd.Series, column: str):
        # A blank Excel cell is loaded by pandas as NaN; without this check it
        # would be formatted into the document as the literal text "nan".
        value = row.get(column, "")
        return "" if pd.isna(value) else value

    print(f"📖 Processing Hadiths: {file_path}")
    df = pd.read_excel(file_path)
    count = 0

    for _, row in df.iterrows():
        content = (
            "HADITH TYPE: HADITH\n"
            f"TITLE: {cell(row, 'Title')}\n"
            f"ARABIC: {cell(row, 'Arabic Text')}\n"
            f"TRANSLATION: {cell(row, 'Translation')}\n"
            f"SOURCE: {cell(row, 'Source Info')}"
        )
        knowledge_base.add_content(text_content=content)
        count += 1

    print(f"✅ Successfully ingested {count} Hadiths into Qdrant.")
|
|
|
|
|
|
def ingest_articles(file_path: str) -> None:
    """Read article rows from an Excel workbook and add each one to the knowledge base.

    Every row is rendered as one text document (type marker, TITLE, AUTHOR,
    CONTENT and URL lines) and handed to ``knowledge_base.add_content``.
    Progress and the final row count are printed to stdout.

    Args:
        file_path: Path of the workbook, read with ``pandas.read_excel``.
            The columns ``Title``, ``Author``, ``Content`` and ``URL`` are
            used; a column that is absent or a cell that is blank is rendered
            as an empty value.
    """

    def cell(row: pd.Series, column: str):
        # A blank Excel cell is loaded by pandas as NaN; without this check it
        # would be formatted into the document as the literal text "nan".
        value = row.get(column, "")
        return "" if pd.isna(value) else value

    print(f"📄 Processing Articles: {file_path}")
    df = pd.read_excel(file_path)
    count = 0

    for _, row in df.iterrows():
        content = (
            "ARTICLE TYPE: ARTICLE\n"
            f"TITLE: {cell(row, 'Title')}\n"
            f"AUTHOR: {cell(row, 'Author')}\n"
            f"CONTENT: {cell(row, 'Content')}\n"
            f"URL: {cell(row, 'URL')}"
        )
        knowledge_base.add_content(text_content=content)
        count += 1

    print(f"✅ Successfully ingested {count} Articles into Qdrant.")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: ingest the hadith and article workbooks found under
    # <parent of this file's directory>/data/raw.
    print("--- 🚀 Starting Data Ingestion to Qdrant ---")

    # 1. Directory that contains this script
    SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))

    # 2. Go up one level to the Project Root
    PROJECT_ROOT = os.path.dirname(SCRIPTS_DIR)

    # 3. Build the path to the data folder
    DATA_DIR = os.path.join(PROJECT_ROOT, "data", "raw")

    # 4. Define your file paths
    HADITH_FILE = os.path.join(DATA_DIR, "hadiths_data.xlsx")
    ARTICLE_FILE = os.path.join(DATA_DIR, "dovodi_articles.xlsx")

    try:
        # Ingest Hadiths (a missing workbook is reported and skipped)
        if os.path.exists(HADITH_FILE):
            ingest_hadiths(HADITH_FILE)
        else:
            print(f"⚠️ {HADITH_FILE} not found!")

        # Ingest Articles (a missing workbook is reported and skipped)
        if os.path.exists(ARTICLE_FILE):
            ingest_articles(ARTICLE_FILE)
        else:
            print(f"⚠️ {ARTICLE_FILE} not found!")

        print("--- ✨ Ingestion Complete ---")
    except Exception as e:
        print(f"❌ Error during ingestion: {e}")
        # Exit with a non-zero status so shells, schedulers and CI can tell
        # that the ingestion failed instead of seeing a successful run.
        sys.exit(1)
|