imam-reza-agent/scripts/embed_excel_wiki.py


								# src/scripts/embed_excel_to_qdrant.py

								import hashlib

								import uuid

								import pandas as pd

								from sqlalchemy import text

								from agno.knowledge.document import Document

								import os

								from urllib.parse import quote_plus

								from sqlalchemy import create_engine

								from dotenv import load_dotenv


								# Import your custom Qdrant and Embedding factories

								from src.knowledge.embedding_factory import EmbeddingFactory

								from src.knowledge.vector_store import get_qdrant_store


								load_dotenv()


								# --- Database Setup ---

								db_user = quote_plus(os.getenv("DB_USER"))

								db_pass = quote_plus(os.getenv("DB_PASSWORD"))

								db_host = os.getenv("DB_HOST")

								db_port = os.getenv("DB_PORT")

								db_name = os.getenv("DB_NAME")

								db_url = f"postgresql+psycopg://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}"


								engine = create_engine(db_url)


								# Read the file that the recovery script just finished

								EXCEL_FILE = "wiki_lang69_cleaned_final.xlsx"

								BATCH_SIZE = 100  # Number of vectors to send to Qdrant at once


								def get_text_from_json(json_data, target_lang='fa'):

								    """Helper to safely extract exactly the Persian text from the JSON arrays."""

								    if not json_data or not isinstance(json_data, list):

								        return "Unknown"


								    # 1. Strictly look for the requested language ('fa')

								    for entry in json_data:

								        if isinstance(entry, dict) and entry.get('language_code') == target_lang:

								            return entry.get('text', 'Unknown')


								    # 2. Fallback: If no Persian translation exists, grab the first available language

								    if len(json_data) > 0 and isinstance(json_data[0], dict):

								        return json_data[0].get('text', 'Unknown')


								    return "Unknown"


								def run_qdrant_ingestion():

								    print(f"📂 Loading cleaned text from {EXCEL_FILE}...")

								    try:

								        df = pd.read_excel(EXCEL_FILE)

								    except FileNotFoundError:

								        print(f"❌ Could not find {EXCEL_FILE}.")

								        return


								    # 1. Filter out any rows that still have errors or empty text

								    valid_df = df[df['error'].isna() & df['clean_text'].notna() & (df['clean_text'] != '')]

								    valid_ids = valid_df['id'].tolist()


								    total_docs = len(valid_ids)

								    print(f"📊 Found {total_docs} valid texts ready for embedding.")


								    if total_docs == 0:

								        return


								    # 2. Fetch all relational metadata from PostgreSQL

								    print("📥 Fetching relational metadata (Titles, Authors, Categories) from Database...")


								    query = text("""

								        SELECT

								            wc.id as content_id,

								            wc.wiki_id,

								            wc.language as lang_code,

								            w.title as wiki_titles,

								            a.id as author_id,

								            a.name as author_names,

								            c.id as category_id,

								            c.name as cat_names

								        FROM wiki_wikicontent wc

								        JOIN wiki_wiki w ON wc.wiki_id = w.id

								        LEFT JOIN wiki_author a ON w.author_id = a.id

								        JOIN wiki_wikicategory c ON w.category_id = c.id

								        WHERE wc.id IN :ids

								    """)


								    with engine.connect() as conn:

								        metadata_rows = conn.execute(query, {"ids": tuple(valid_ids)}).fetchall()


								    metadata_lookup = {row.content_id: row for row in metadata_rows}


								    # 3. Initialize Qdrant

								    print("🤖 Initializing Embedding Model and Qdrant Connection...")

								    embed_factory = EmbeddingFactory()

								    embedder = embed_factory.get_embedder()

								    vector_db = get_qdrant_store(embedder=embedder)

								    active_collection = vector_db.collection


								    # We need to track the model name to update the database later

								    model_json_str = f'["{active_collection}"]'


								    documents_to_upsert = []

								    processed_count = 0


								    print(f"🚀 Starting batch embedding into collection: {active_collection}")


								    # 4. Process valid Excel rows and build Agno Documents

								    for _, row in valid_df.iterrows():

								        content_id = row['id']

								        clean_text = row['clean_text']


								        db_meta = metadata_lookup.get(content_id)

								        if not db_meta:

								            continue


								        # Extract strictly Persian strings

								        wiki_title = get_text_from_json(db_meta.wiki_titles, 'fa')

								        author_name = get_text_from_json(db_meta.author_names, 'fa')

								        category_name = get_text_from_json(db_meta.cat_names, 'fa')


								        # Part A: The Narrative String

								        narrative_text = (

								            f"CATEGORY: {category_name}\n"

								            f"WIKI TITLE: {wiki_title}\n"

								            f"AUTHOR: {author_name}\n"

								            f"CONTENT:\n{clean_text}"

								        )


								        # Part B: The Strict Payload

								        payload = {

								            "source_type": "WIKI",

								            "content_id": content_id,

								            "wiki_id": db_meta.wiki_id,

								            "wiki_title": wiki_title,

								            "category_id": db_meta.category_id,

								            "category_name": category_name,

								            "author_id": db_meta.author_id, # Will be None if missing, which is fine

								            "author_name": author_name,

								            "language": db_meta.lang_code

								        }


								        # Deterministic Hash ID

								        hash_id = hashlib.md5(f"WIKI_{content_id}_{active_collection}".encode()).hexdigest()

								        qdrant_id = str(uuid.UUID(hash_id))


								        doc = Document(

								            id=qdrant_id,

								            content=narrative_text,

								            meta_data=payload

								        )

								        documents_to_upsert.append(doc)


								        # 5. Batch Upsert to Qdrant

								        if len(documents_to_upsert) >= BATCH_SIZE:

								            vector_db.upsert(documents=documents_to_upsert)

								            processed_count += len(documents_to_upsert)

								            print(f"✅ Embedded {processed_count}/{total_docs} vectors...")

								            documents_to_upsert = []


								    # Flush final partial batch

								    if documents_to_upsert:

								        vector_db.upsert(documents=documents_to_upsert)

								        processed_count += len(documents_to_upsert)

								        print(f"✅ Embedded {processed_count}/{total_docs} vectors...")


								    # 6. Mark as synced in PostgreSQL

								    print("🔄 Updating PostgreSQL to mark items as embedded...")

								    with engine.begin() as conn: # Use .begin() for automatic transaction commit

								        update_sql = text("""

								            UPDATE wiki_wikicontent

								            SET embedded_in = CAST(COALESCE(embedded_in, '[]') AS jsonb) || CAST(:model_json AS jsonb)

								            WHERE id IN :ids AND NOT (COALESCE(embedded_in, '[]')::jsonb @> :model_json::jsonb)

								        """)

								        conn.execute(update_sql, {"model_json": model_json_str, "ids": tuple(valid_ids)})


								    print("🎉 All documents successfully embedded and database updated!")


								if __name__ == "__main__":

								    run_qdrant_ingestion()