Browse Source
Update environment configurations, modify Docker settings, and add new scripts for data processing. Adjusted Jenkinsfile for deployment, updated .gitignore, and refined agent instructions for improved user interaction.
master
Update environment configurations, modify Docker settings, and add new scripts for data processing. Adjusted Jenkinsfile for deployment, updated .gitignore, and refined agent instructions for improved user interaction.
master
9 changed files with 544 additions and 74 deletions
-
1.gitignore
-
66Jenkinsfile
-
26config/production.env
-
18docker-compose.yml
-
178scripts/embed_excel_wiki.py
-
2src/agents/base_agent.py
-
40src/utils/load_settings.py
-
144src/utils/recovery_reformat.py
-
139src/utils/reformat_wiki.py
@ -1,34 +1,34 @@ |
|||||
// Deployment pipeline: on origin/master, ssh into the production host and run
// the project's runner.sh, then post the build log to a Telegram chat.
// NOTE(review): the previous commented-out pipeline variant was removed as
// dead code; recover it from git history if needed.
pipeline {
    environment {
        develop_server_ip = ''
        develop_server_name = ''
        production_server_ip = "88.99.212.243"
        production_server_name = "newhorizon_germany_001_server"
        // BUG FIX: the original line was missing the closing double quote,
        // which makes the Jenkinsfile unparseable.
        project_path = "/projects/imam-reza-shrine/imam_reza_shrine_agent"
        version = "master"
        gitBranch = "origin/master"
    }
    agent any
    stages {
        stage('deploy') {
            steps {
                script {
                    if (gitBranch == "origin/master") {
                        withCredentials([usernamePassword(credentialsId: production_server_name, usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
                            sh 'sshpass -p $PASSWORD ssh -p 1782 $USERNAME@$production_server_ip -o StrictHostKeyChecking=no "cd $project_path && ./runner.sh"'
                            def lastCommit = sh(script: 'git log -1 --pretty=format:"%h - %s (%an)"', returnStdout: true).trim()
                            // SECURITY(review): the Telegram bot token below is hard-coded and now
                            // public in VCS — revoke it and move it into a Jenkins credential.
                            sh """
                                curl -F chat_id=1457670318 \
                                     -F message_thread_id=6 \
                                     -F document=@/var/jenkins_home/jobs/${env.JOB_NAME}/builds/${env.BUILD_NUMBER}/log \
                                     -F caption='Project name: #${env.JOB_NAME} \nBuild status is ${currentBuild.currentResult} \nBuild url: ${BUILD_URL} \nLast Commit: ${lastCommit}' \
                                     https://api.telegram.org/bot7207581748:AAFeymryw7S44D86LYfWqYK-tSNeV3TOwBs/sendDocument
                            """
                        }
                    }
                }
            }
        }
    }
}
||||
@ -0,0 +1,178 @@ |
|||||
|
# src/scripts/embed_excel_to_qdrant.py
"""Module-level setup: environment config, database engine, and the
ingestion constants used by run_qdrant_ingestion()."""

import hashlib
import os
import uuid
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

from agno.knowledge.document import Document

# Import your custom Qdrant and Embedding factories
from src.knowledge.embedding_factory import EmbeddingFactory
from src.knowledge.vector_store import get_qdrant_store

load_dotenv()


# --- Database Setup ---
def _require_env(name):
    """Return the env var value, or fail fast with a clear message.

    BUG FIX: quote_plus(os.getenv(...)) raised an opaque TypeError when a
    variable was missing; now the missing variable is named explicitly.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Required environment variable {name} is not set")
    return value


# Credentials are URL-escaped so special characters survive in the DSN.
db_user = quote_plus(_require_env("DB_USER"))
db_pass = quote_plus(_require_env("DB_PASSWORD"))
db_host = _require_env("DB_HOST")
db_port = _require_env("DB_PORT")
db_name = _require_env("DB_NAME")
db_url = f"postgresql+psycopg://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}"

engine = create_engine(db_url)

# Read the file that the recovery script just finished
EXCEL_FILE = "wiki_lang69_cleaned_final.xlsx"
BATCH_SIZE = 100  # Number of vectors to send to Qdrant at once
|
def get_text_from_json(json_data, target_lang='fa'):
    """Pick the text for *target_lang* out of a list of translation dicts.

    Each entry is expected to look like ``{"language_code": ..., "text": ...}``.
    Falls back to the first entry when the requested language is absent, and
    returns ``"Unknown"`` whenever the structure is missing or malformed.
    """
    if not isinstance(json_data, list) or not json_data:
        return "Unknown"

    # Prefer an exact match on the requested language code.
    match = next(
        (item for item in json_data
         if isinstance(item, dict) and item.get('language_code') == target_lang),
        None,
    )
    if match is not None:
        return match.get('text', 'Unknown')

    # Fallback: no translation for target_lang — use the first well-formed entry.
    first = json_data[0]
    if isinstance(first, dict):
        return first.get('text', 'Unknown')

    return "Unknown"
||||
|
|
||||
|
def run_qdrant_ingestion():
    """Embed cleaned wiki texts from EXCEL_FILE into Qdrant, then mark them
    as embedded in PostgreSQL.

    Pipeline: load the recovery script's Excel output, keep only error-free
    rows, join them with relational metadata (title/author/category), build
    narrative documents, upsert to Qdrant in batches of BATCH_SIZE, and
    finally append the collection name to ``wiki_wikicontent.embedded_in``.
    """
    from sqlalchemy import bindparam  # local import: only this function needs it

    print(f"📂 Loading cleaned text from {EXCEL_FILE}...")
    try:
        df = pd.read_excel(EXCEL_FILE)
    except FileNotFoundError:
        print(f"❌ Could not find {EXCEL_FILE}.")
        return

    # 1. Filter out any rows that still have errors or empty text
    valid_df = df[df['error'].isna() & df['clean_text'].notna() & (df['clean_text'] != '')]
    valid_ids = valid_df['id'].tolist()

    total_docs = len(valid_ids)
    print(f"📊 Found {total_docs} valid texts ready for embedding.")

    if total_docs == 0:
        return

    # 2. Fetch all relational metadata from PostgreSQL
    print("📥 Fetching relational metadata (Titles, Authors, Categories) from Database...")

    # BUG FIX: ``IN :ids`` bound to a plain tuple is not expanded into an
    # IN-list by SQLAlchemy's text() with the psycopg driver; an expanding
    # bindparam renders one placeholder per id (and handles a single id too).
    query = text("""
        SELECT
            wc.id as content_id,
            wc.wiki_id,
            wc.language as lang_code,
            w.title as wiki_titles,
            a.id as author_id,
            a.name as author_names,
            c.id as category_id,
            c.name as cat_names
        FROM wiki_wikicontent wc
        JOIN wiki_wiki w ON wc.wiki_id = w.id
        LEFT JOIN wiki_author a ON w.author_id = a.id
        JOIN wiki_wikicategory c ON w.category_id = c.id
        WHERE wc.id IN :ids
    """).bindparams(bindparam("ids", expanding=True))

    with engine.connect() as conn:
        metadata_rows = conn.execute(query, {"ids": list(valid_ids)}).fetchall()

    metadata_lookup = {row.content_id: row for row in metadata_rows}

    # 3. Initialize Qdrant
    print("🤖 Initializing Embedding Model and Qdrant Connection...")
    embed_factory = EmbeddingFactory()
    embedder = embed_factory.get_embedder()
    vector_db = get_qdrant_store(embedder=embedder)
    active_collection = vector_db.collection

    # Recorded in the database later so we know which collection holds each row.
    model_json_str = f'["{active_collection}"]'

    documents_to_upsert = []
    processed_count = 0

    print(f"🚀 Starting batch embedding into collection: {active_collection}")

    # 4. Process valid Excel rows and build Agno Documents
    for _, row in valid_df.iterrows():
        content_id = row['id']
        clean_text = row['clean_text']

        db_meta = metadata_lookup.get(content_id)
        if not db_meta:
            continue

        # Extract strictly Persian strings
        wiki_title = get_text_from_json(db_meta.wiki_titles, 'fa')
        author_name = get_text_from_json(db_meta.author_names, 'fa')
        category_name = get_text_from_json(db_meta.cat_names, 'fa')

        # Part A: The Narrative String (what actually gets embedded)
        narrative_text = (
            f"CATEGORY: {category_name}\n"
            f"WIKI TITLE: {wiki_title}\n"
            f"AUTHOR: {author_name}\n"
            f"CONTENT:\n{clean_text}"
        )

        # Part B: The Strict Payload (structured metadata stored alongside)
        payload = {
            "source_type": "WIKI",
            "content_id": content_id,
            "wiki_id": db_meta.wiki_id,
            "wiki_title": wiki_title,
            "category_id": db_meta.category_id,
            "category_name": category_name,
            "author_id": db_meta.author_id,  # Will be None if missing, which is fine
            "author_name": author_name,
            "language": db_meta.lang_code
        }

        # Deterministic hash-derived UUID so re-runs upsert instead of duplicating.
        hash_id = hashlib.md5(f"WIKI_{content_id}_{active_collection}".encode()).hexdigest()
        qdrant_id = str(uuid.UUID(hash_id))

        doc = Document(
            id=qdrant_id,
            content=narrative_text,
            meta_data=payload
        )
        documents_to_upsert.append(doc)

        # 5. Batch Upsert to Qdrant
        if len(documents_to_upsert) >= BATCH_SIZE:
            vector_db.upsert(documents=documents_to_upsert)
            processed_count += len(documents_to_upsert)
            print(f"✅ Embedded {processed_count}/{total_docs} vectors...")
            documents_to_upsert = []

    # Flush final partial batch
    if documents_to_upsert:
        vector_db.upsert(documents=documents_to_upsert)
        processed_count += len(documents_to_upsert)
        print(f"✅ Embedded {processed_count}/{total_docs} vectors...")

    # 6. Mark as synced in PostgreSQL
    print("🔄 Updating PostgreSQL to mark items as embedded...")
    with engine.begin() as conn:  # .begin() commits the transaction automatically
        # Same expanding-bindparam fix as the SELECT above; also use CAST()
        # instead of ``:param::jsonb`` so the cast cannot be mis-parsed as
        # part of the bind parameter name.
        update_sql = text("""
            UPDATE wiki_wikicontent
            SET embedded_in = CAST(COALESCE(embedded_in, '[]') AS jsonb) || CAST(:model_json AS jsonb)
            WHERE id IN :ids AND NOT (CAST(COALESCE(embedded_in, '[]') AS jsonb) @> CAST(:model_json AS jsonb))
        """).bindparams(bindparam("ids", expanding=True))
        conn.execute(update_sql, {"model_json": model_json_str, "ids": list(valid_ids)})

    print("🎉 All documents successfully embedded and database updated!")


if __name__ == "__main__":
    run_qdrant_ingestion()
||||
@ -0,0 +1,144 @@ |
|||||
|
# src/scripts/jina_recovery_processor.py
"""Setup for the Jina recovery pass: environment config, pooled DB engine,
API key, file names, and the thread-shared progress counters."""

import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import quote_plus

import pandas as pd
import requests
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

load_dotenv()


# --- Database Setup ---
def _require_env(name):
    """Return the env var value, or fail fast with a clear message.

    BUG FIX: quote_plus(os.getenv(...)) raised an opaque TypeError when a
    variable was missing; now the missing variable is named explicitly.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Required environment variable {name} is not set")
    return value


# Credentials are URL-escaped so special characters survive in the DSN.
db_user = quote_plus(_require_env("DB_USER"))
db_pass = quote_plus(_require_env("DB_PASSWORD"))
db_host = _require_env("DB_HOST")
db_port = _require_env("DB_PORT")
db_name = _require_env("DB_NAME")
db_url = f"postgresql+psycopg://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}"

# Pooled engine: pre-ping + recycle guard against stale connections during a
# long multi-threaded run.
engine = create_engine(
    db_url,
    pool_pre_ping=True,
    pool_recycle=3600,
    pool_size=10,
    max_overflow=20
)

JINA_API_KEY = os.getenv("JINA_API_KEY")

# --- Settings ---
EXCEL_FILE = "wiki_lang69_cleaned_final.xlsx"
OUTPUT_FILE = "wiki_lang69_cleaned_final_final.xlsx"  # We save to a new file to be 100% safe
MAX_WORKERS = 5
RATE_LIMIT_DELAY = 0.42  # ~142 requests/minute across the pool

# Progress counters shared by worker threads, guarded by counter_lock.
counter_lock = threading.Lock()
completed_tasks = 0
total_tasks = 0
||||
|
def process_html(row):
    """Worker: re-submit one row's HTML to the Jina Reader API.

    Returns a dict with the row id and the extracted clean text; on failure
    the dict additionally carries an ``'error'`` message.
    """
    global completed_tasks, total_tasks

    auth_headers = {
        "Authorization": f"Bearer {JINA_API_KEY}",
        "Accept": "application/json"
    }
    upload = {
        'file': (f'document_{row.id}.html', row.content, 'text/html')
    }

    try:
        resp = requests.post("https://r.jina.ai/", headers=auth_headers, files=upload, timeout=30)
        resp.raise_for_status()
        payload = resp.json().get('data', {})
        outcome = {
            'id': row.id,
            'clean_text': payload.get('content', '')
        }
    except Exception as exc:  # best-effort: record the failure, never crash the pool
        detail = str(exc)
        response = getattr(exc, 'response', None)
        if response is not None:
            detail += f" | {response.text}"
        outcome = {
            'id': row.id,
            'clean_text': '',
            'error': detail
        }

    # Update the shared counter and report progress atomically.
    with counter_lock:
        completed_tasks += 1
        if 'error' in outcome:
            print(f"⚠️ [Error] [{completed_tasks}/{total_tasks}] ID {outcome['id']}: {outcome['error']}")
        else:
            print(f"✅ [{completed_tasks}/{total_tasks}] Successfully recovered ID {outcome['id']}")

    return outcome
||||
|
|
||||
|
def run_recovery():
    """Retry the rows that failed in the first Jina pass and write the
    patched data to OUTPUT_FILE.

    Failed rows (error set, or clean_text missing/empty) are re-fetched from
    the database, re-submitted through the thread pool, and the DataFrame is
    updated in place with either the recovered text or the new error.
    """
    from sqlalchemy import bindparam  # local import: only this function needs it

    global total_tasks

    print(f"📂 Loading {EXCEL_FILE}...")
    try:
        df = pd.read_excel(EXCEL_FILE)
    except FileNotFoundError:
        print(f"❌ Could not find {EXCEL_FILE}. Make sure the name is correct!")
        return

    # ROBUSTNESS FIX: a run with zero failures can produce a file without an
    # 'error' column; create it so the mask below cannot raise KeyError.
    if 'error' not in df.columns:
        df['error'] = None

    # 1. Identify which rows failed or are missing text
    failed_mask = df['error'].notna() | df['clean_text'].isna() | (df['clean_text'] == '')
    failed_ids = df[failed_mask]['id'].tolist()

    total_tasks = len(failed_ids)
    print(f"📊 Found {total_tasks} failed rows to recover.")

    if total_tasks == 0:
        print("✅ Everything is already successfully processed!")
        return

    # 2. Fetch ONLY the failed HTML contents from the database
    print("📥 Fetching pending HTML content from database...")

    # SECURITY FIX: the ids come from an editable Excel file, so they must not
    # be spliced into the SQL string; bind them via an expanding parameter.
    query = text("SELECT id, content FROM wiki_wikicontent WHERE id IN :ids").bindparams(
        bindparam("ids", expanding=True)
    )
    with engine.connect() as conn:
        # int() also normalizes ids pandas may have read back as floats.
        rows = conn.execute(query, {"ids": [int(i) for i in failed_ids]}).fetchall()

    print(f"🚀 Starting Multi-threaded RECOVERY (Speed: ~142 req/min)...")

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_row = {}

        # Dispatch tasks, throttled to stay under the API rate limit.
        for row in rows:
            future = executor.submit(process_html, row)
            future_to_row[future] = row
            time.sleep(RATE_LIMIT_DELAY)

        # 3. Process results and dynamically update the Pandas DataFrame
        for future in as_completed(future_to_row):
            result = future.result()
            row_selector = df['id'] == result['id']

            if 'error' in result:
                # Update the specific row with the new error
                df.loc[row_selector, 'error'] = result['error']
                df.loc[row_selector, 'clean_text'] = ''
            else:
                # Inject the clean text and wipe the error completely!
                df.loc[row_selector, 'clean_text'] = result['clean_text']
                df.loc[row_selector, 'error'] = None

    # 4. Save the updated data
    df.to_excel(OUTPUT_FILE, index=False)
    print(f"🎉 Recovery Complete! Saved updated data to {OUTPUT_FILE}")


if __name__ == "__main__":
    run_recovery()
||||
@ -0,0 +1,139 @@ |
|||||
|
# src/scripts/jina_batch_processor.py
"""Setup for the initial Jina batch pass: environment config, pooled DB
engine, API key, and concurrency/rate-limit tuning."""

import os
import threading  # needed for the thread-safe progress counter
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import quote_plus

import pandas as pd
import requests
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

load_dotenv()


def _require_env(name):
    """Return the env var value, or fail fast with a clear message.

    BUG FIX: quote_plus(None) raised an opaque TypeError when a variable was
    missing; now the missing variable is named explicitly.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Required environment variable {name} is not set")
    return value


db_user = _require_env("DB_USER")
db_pass = _require_env("DB_PASSWORD")
db_host = _require_env("DB_HOST")
db_port = _require_env("DB_PORT")
db_name = _require_env("DB_NAME")

# URL-escape credentials so special characters survive in the DSN.
db_pass = quote_plus(db_pass)
db_user = quote_plus(db_user)
db_url = f"postgresql+psycopg://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}"

# Pooled engine: pre-ping + recycle guard against stale connections during a
# long multi-threaded run.
engine = create_engine(
    db_url,
    pool_pre_ping=True,
    pool_recycle=3600,
    pool_size=10,
    max_overflow=20
)

JINA_API_KEY = os.getenv("JINA_API_KEY")

# ⚙️ Concurrency & Rate Limit Settings
MAX_WORKERS = 5
RATE_LIMIT_DELAY = 0.42  # ~142 requests/minute across the pool

# Progress counters shared by worker threads, guarded by counter_lock.
counter_lock = threading.Lock()
completed_tasks = 0
total_tasks = 0
||||
|
def process_html(row):
    """Worker run in the thread pool: extract clean text for one HTML row
    via the Jina Reader API.

    Returns ``{'id', 'clean_text'}`` on success; adds an ``'error'`` key on
    failure so the caller can export the failure reason.
    """
    global completed_tasks, total_tasks

    request_headers = {
        "Authorization": f"Bearer {JINA_API_KEY}",
        "Accept": "application/json"
    }
    upload = {
        'file': (f'document_{row.id}.html', row.content, 'text/html')
    }

    try:
        resp = requests.post("https://r.jina.ai/", headers=request_headers, files=upload, timeout=30)
        resp.raise_for_status()
        payload = resp.json().get('data', {})
        outcome = {
            'id': row.id,
            'clean_text': payload.get('content', '')
        }
    except Exception as exc:  # best-effort: record the failure, never kill the pool
        detail = str(exc)
        response = getattr(exc, 'response', None)
        if response is not None:
            detail += f" | {response.text}"
        outcome = {
            'id': row.id,
            'clean_text': '',
            'error': detail
        }

    # Safely update the counter and print progress under the lock.
    with counter_lock:
        completed_tasks += 1
        if 'error' in outcome:
            print(f"⚠️ [Error] [{completed_tasks}/{total_tasks}] ID {outcome['id']}: {outcome['error']}")
        else:
            print(f"✅ [{completed_tasks}/{total_tasks}] Successfully processed ID {outcome['id']}")

    return outcome
||||
|
|
||||
|
def run_threaded_extraction(language_id=69, excel_filename="wiki_lang69_cleaned.xlsx"):
    """Extract clean text for every wiki content row in *language_id*.

    Fetches the raw HTML from PostgreSQL, pushes it through the Jina Reader
    API with a throttled thread pool, and exports the results to
    *excel_filename*.

    GENERALIZATION: the language id and output filename were hard-coded; they
    are now defaulted parameters, so the original behaviour is preserved for
    existing callers while other languages become reachable.
    """
    global total_tasks

    print(f"📥 Fetching Wiki HTML content (Language ID: {language_id})...")
    with engine.connect() as conn:
        # Parameterized instead of a literal so language_id stays out of the SQL text.
        query = text("SELECT id, content FROM wiki_wikicontent WHERE language_id = :lang_id")
        rows = conn.execute(query, {"lang_id": language_id}).fetchall()

    total_tasks = len(rows)
    print(f"📊 Found {total_tasks} rows to process.")

    if total_tasks == 0:
        return

    extracted_data = []
    print(f"🚀 Starting Multi-threaded processing (Speed: ~142 req/min)...")

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_row = {}

        # 1. THE DISPATCHER (Main Thread): submit one task per row, throttled.
        for idx, row in enumerate(rows):
            future = executor.submit(process_html, row)
            future_to_row[future] = row

            # Single self-overwriting status line so the dispatcher's progress
            # stays visible while workers print their own results.
            print(f"⏳ Dispatching task {idx + 1}/{total_tasks} to thread pool...", end="\r")

            # The throttle keeps us under the Jina API rate limit.
            time.sleep(RATE_LIMIT_DELAY)

        print("\n🏁 All tasks dispatched! Waiting for final threads to finish...")

        # 2. THE RECEIVER (Main Thread): workers report their own progress,
        # so we only collect results here.
        for future in as_completed(future_to_row):
            extracted_data.append(future.result())

    # 3. EXPORT TO EXCEL
    if extracted_data:
        df = pd.DataFrame(extracted_data)

        # Keep a stable column order when any row carried an error.
        if 'error' in df.columns:
            df = df[['id', 'clean_text', 'error']]

        df.to_excel(excel_filename, index=False)
        print(f"🎉 Success! Saved {len(extracted_data)} rows to {excel_filename}")


if __name__ == "__main__":
    run_threaded_extraction()
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue