# src/scripts/embed_excel_to_qdrant.py
import hashlib
import os
import uuid
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import bindparam, create_engine, text

from agno.knowledge.document import Document

# Import your custom Qdrant and Embedding factories
from src.knowledge.embedding_factory import EmbeddingFactory
from src.knowledge.vector_store import get_qdrant_store

load_dotenv()

# --- Database Setup ---
db_user = quote_plus(os.getenv("DB_USER"))
db_pass = quote_plus(os.getenv("DB_PASSWORD"))
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_NAME")

db_url = f"postgresql+psycopg://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}"
engine = create_engine(db_url)

# Read the file that the recovery script just finished
EXCEL_FILE = "wiki_lang69_cleaned_final.xlsx"
BATCH_SIZE = 100  # Number of vectors to send to Qdrant at once


def get_text_from_json(json_data, target_lang='fa'):
    """Safely extract the Persian text from a JSON array of translation dicts."""
    if not json_data or not isinstance(json_data, list):
        return "Unknown"

    # 1. Strictly look for the requested language ('fa')
    for entry in json_data:
        if isinstance(entry, dict) and entry.get('language_code') == target_lang:
            return entry.get('text', 'Unknown')

    # 2. Fallback: if no Persian translation exists, grab the first available language
    if len(json_data) > 0 and isinstance(json_data[0], dict):
        return json_data[0].get('text', 'Unknown')

    return "Unknown"


def run_qdrant_ingestion():
    print(f"📂 Loading cleaned text from {EXCEL_FILE}...")
    try:
        df = pd.read_excel(EXCEL_FILE)
    except FileNotFoundError:
        print(f"❌ Could not find {EXCEL_FILE}.")
        return

    # 1. Filter out any rows that still have errors or empty text
    valid_df = df[df['error'].isna() & df['clean_text'].notna() & (df['clean_text'] != '')]
    valid_ids = valid_df['id'].tolist()
    total_docs = len(valid_ids)

    print(f"📊 Found {total_docs} valid texts ready for embedding.")
    if total_docs == 0:
        return

    # 2. Fetch all relational metadata from PostgreSQL
    print("📥 Fetching relational metadata (Titles, Authors, Categories) from Database...")
    # The expanding bindparam renders the Python list as a proper IN (...) clause
    query = text("""
        SELECT
            wc.id as content_id,
            wc.wiki_id,
            wc.language as lang_code,
            w.title as wiki_titles,
            a.id as author_id,
            a.name as author_names,
            c.id as category_id,
            c.name as cat_names
        FROM wiki_wikicontent wc
        JOIN wiki_wiki w ON wc.wiki_id = w.id
        LEFT JOIN wiki_author a ON w.author_id = a.id
        JOIN wiki_wikicategory c ON w.category_id = c.id
        WHERE wc.id IN :ids
    """).bindparams(bindparam("ids", expanding=True))

    with engine.connect() as conn:
        metadata_rows = conn.execute(query, {"ids": valid_ids}).fetchall()

    metadata_lookup = {row.content_id: row for row in metadata_rows}

    # 3. Initialize Qdrant
    print("🤖 Initializing Embedding Model and Qdrant Connection...")
    embed_factory = EmbeddingFactory()
    embedder = embed_factory.get_embedder()
    vector_db = get_qdrant_store(embedder=embedder)
    active_collection = vector_db.collection

    # Track the active collection name so we can mark rows as embedded later
    model_json_str = f'["{active_collection}"]'

    documents_to_upsert = []
    processed_count = 0

    print(f"🚀 Starting batch embedding into collection: {active_collection}")

    # 4. Process valid Excel rows and build Agno Documents
    for _, row in valid_df.iterrows():
        content_id = row['id']
        clean_text = row['clean_text']

        db_meta = metadata_lookup.get(content_id)
        if not db_meta:
            continue

        # Extract strictly Persian strings
        wiki_title = get_text_from_json(db_meta.wiki_titles, 'fa')
        author_name = get_text_from_json(db_meta.author_names, 'fa')
        category_name = get_text_from_json(db_meta.cat_names, 'fa')

        # Part A: The narrative string that gets embedded
        narrative_text = (
            f"CATEGORY: {category_name}\n"
            f"WIKI TITLE: {wiki_title}\n"
            f"AUTHOR: {author_name}\n"
            f"CONTENT:\n{clean_text}"
        )

        # Part B: The strict payload stored alongside the vector
        payload = {
            "source_type": "WIKI",
            "content_id": content_id,
            "wiki_id": db_meta.wiki_id,
            "wiki_title": wiki_title,
            "category_id": db_meta.category_id,
            "category_name": category_name,
            "author_id": db_meta.author_id,  # Will be None if missing, which is fine
            "author_name": author_name,
            "language": db_meta.lang_code
        }

        # Deterministic hash-based UUID so re-runs overwrite instead of duplicating
        hash_id = hashlib.md5(f"WIKI_{content_id}_{active_collection}".encode()).hexdigest()
        qdrant_id = str(uuid.UUID(hash_id))

        doc = Document(
            id=qdrant_id,
            content=narrative_text,
            meta_data=payload
        )
        documents_to_upsert.append(doc)

        # 5. Batch upsert to Qdrant
        if len(documents_to_upsert) >= BATCH_SIZE:
            vector_db.upsert(documents=documents_to_upsert)
            processed_count += len(documents_to_upsert)
            print(f"✅ Embedded {processed_count}/{total_docs} vectors...")
            documents_to_upsert = []

    # Flush the final partial batch
    if documents_to_upsert:
        vector_db.upsert(documents=documents_to_upsert)
        processed_count += len(documents_to_upsert)
        print(f"✅ Embedded {processed_count}/{total_docs} vectors...")

    # 6. Mark as synced in PostgreSQL
    print("🔄 Updating PostgreSQL to mark items as embedded...")
    with engine.begin() as conn:  # .begin() gives an automatic transaction commit
        update_sql = text("""
            UPDATE wiki_wikicontent
            SET embedded_in = CAST(COALESCE(embedded_in, '[]') AS jsonb) || CAST(:model_json AS jsonb)
            WHERE id IN :ids
              AND NOT (CAST(COALESCE(embedded_in, '[]') AS jsonb) @> CAST(:model_json AS jsonb))
        """).bindparams(bindparam("ids", expanding=True))
        conn.execute(update_sql, {"model_json": model_json_str, "ids": valid_ids})

    print("🎉 All documents successfully embedded and database updated!")


if __name__ == "__main__":
    run_qdrant_ingestion()
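
# Usage sketch (assumptions not stated in the script itself: the repository root is the
# working directory so the `src.*` imports resolve, and a .env file supplies DB_USER,
# DB_PASSWORD, DB_HOST, DB_PORT and DB_NAME; adjust the module path to your layout):
#
#   python -m src.scripts.embed_excel_to_qdrant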