You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
115 lines
3.3 KiB
115 lines
3.3 KiB
import json
|
|
import re
|
|
import sys
|
|
|
|
def detect_language(text):
    """
    Guess a language code from the script used in ``text``.

    The checks run in a fixed order and the first match wins:

    - any Cyrillic character (U+0400-U+04FF) -> 'ru'
    - any Arabic/Persian character (U+0600-U+06FF) -> 'fa'
    - anything else, including empty or non-string input -> 'en'

    Args:
        text: Value to inspect. Anything that is not a non-empty ``str`` is
            treated as having no detectable script.

    Returns:
        One of the strings 'ru', 'fa' or 'en'.
    """
    if not text or not isinstance(text, str):
        return 'en'

    # Match the whole Cyrillic block instead of only the а-я / А-Я ranges:
    # those ranges leave out 'ё'/'Ё' (U+0451 / U+0401) and every other
    # Cyrillic letter outside U+0410-U+044F, which would fall through to 'en'.
    if re.search(r'[\u0400-\u04FF]', text):
        return 'ru'

    # Arabic block; Persian is written with letters from the same block.
    if re.search(r'[\u0600-\u06FF]', text):
        return 'fa'

    # No recognised script found.
    return 'en'
|
|
|
|
def reformat_data(input_file, output_file):
    """
    Convert plain-text fields of selected records into language-tagged lists.

    Loads a JSON array of records from ``input_file``. For every record whose
    'model' value is a key of ``TARGETS``, each configured field that is
    present in the record's 'fields' mapping is rewritten in place:

    - ``None``  -> ``[]``
    - a string  -> ``[{"text": <string>, "language_code": <detected code>}]``
    - otherwise -> left untouched (a list is treated as already converted)

    The complete data set, including records that were not touched, is then
    written to ``output_file`` as indented UTF-8 JSON.

    Read, parse and write problems are reported on stdout instead of being
    raised; after a read or parse problem nothing is written.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write.

    Returns:
        None.
    """
    print(f"📖 Reading {input_file}...")

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: File '{input_file}' not found.")
        return
    except json.JSONDecodeError as e:
        print(f"❌ Error: Failed to decode JSON. {e}")
        return
    except (OSError, UnicodeDecodeError) as e:
        # Permission problems, a directory instead of a file, or bytes that
        # are not valid UTF-8 would otherwise escape as a traceback.
        print(f"❌ Error: Could not read '{input_file}'. {e}")
        return

    # The loop below needs a list of record objects; any other top-level JSON
    # value would fail with an unrelated AttributeError/TypeError.
    if not isinstance(data, list):
        print(
            "❌ Error: Expected a JSON array of records, "
            f"got {type(data).__name__}."
        )
        return

    processed_count = 0

    # Model label -> names of the fields to convert for that model.
    TARGETS = {
        'hadis.narratorlayer': [
            'name',
            'description',
        ],
        'hadis.transmitters': [
            'full_name',
            'kunya',
            'known_as',
            'nickname',
            'origin',
            'lived_in',
            'died_in',
            'description',
        ],
        'hadis.transmitteropinion': [
            'scholar_name',
            'opinion_text',
        ],
        'hadis.transmitteroriginaltext': [
            'title',
            'text',
        ],
    }

    for record in data:
        # Skip anything that is not a JSON object rather than crashing on it.
        if not isinstance(record, dict):
            continue

        model = record.get('model')
        # The isinstance check also keeps unhashable values out of the lookup.
        if not isinstance(model, str) or model not in TARGETS:
            continue

        fields = record.get('fields', {})
        if not isinstance(fields, dict):
            continue

        for field in TARGETS[model]:
            if field not in fields:
                continue

            original_value = fields[field]

            if original_value is None:
                # Null becomes an empty list.
                fields[field] = []
            elif isinstance(original_value, str):
                # A plain string becomes a one-element list tagged with the
                # language detected from the string itself.
                fields[field] = [
                    {
                        "text": original_value,
                        "language_code": detect_language(original_value),
                    }
                ]
            # Lists (already converted) and any other type stay as they are.

        # Counted once per record whose model is configured in TARGETS.
        processed_count += 1

    print(f"✅ Processed {processed_count} records.")

    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved reformatted data to: {output_file}")
    except (OSError, TypeError, ValueError) as e:
        # OSError: the file cannot be opened or written.
        # TypeError/ValueError: json.dump or the UTF-8 encoder rejected a value.
        print(f"❌ Error writing output file: {e}")
|
|
|
|
if __name__ == "__main__":
    # Input/output filenames. Either one may be overridden from the command
    # line as: <script> [INPUT_FILE [OUTPUT_FILE]]; with no arguments the
    # defaults below are used.
    INPUT_FILE = sys.argv[1] if len(sys.argv) > 1 else "transmitters_backup.json"
    OUTPUT_FILE = sys.argv[2] if len(sys.argv) > 2 else "transmitters_reformatted.json"

    reformat_data(INPUT_FILE, OUTPUT_FILE)
|