You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

115 lines
3.3 KiB

import json
import re
import sys
def detect_language(text):
    """
    Guess a language code from the script used in *text*.

    Rules, checked in order:
    - any Cyrillic character -> 'ru'
    - any Arabic/Persian character -> 'fa'
    - anything else (including empty or non-string input) -> 'en'

    Args:
        text: The value to inspect. Empty or non-string values are accepted.

    Returns:
        One of 'ru', 'fa' or 'en'.
    """
    if not text or not isinstance(text, str):
        return 'en'
    # Match the whole Cyrillic block (U+0400-U+04FF). A plain а-я/А-Я range
    # leaves out 'ё'/'Ё' (U+0451/U+0401), which sit outside that contiguous
    # run, so text made only of such letters would fall through to 'en'.
    if re.search(r'[\u0400-\u04FF]', text):
        return 'ru'
    # Arabic block (U+0600-U+06FF); it also holds the Persian-specific letters.
    if re.search(r'[\u0600-\u06FF]', text):
        return 'fa'
    # Default to English
    return 'en'
def _reformat_fields(fields, target_fields):
    """Convert the listed entries of *fields* in place to the list-of-dicts form."""
    for name in target_fields:
        if name not in fields:
            continue
        value = fields[name]
        if value is None:
            # Null -> empty list
            fields[name] = []
        elif isinstance(value, str):
            # Plain string -> single entry tagged with its detected language
            fields[name] = [
                {
                    "text": value,
                    "language_code": detect_language(value),
                }
            ]
        # Lists (already converted) and any other value type are left untouched.


def reformat_data(input_file, output_file):
    """
    Rewrite selected text fields of a JSON record list into a tagged-list form.

    The input must be a JSON array of objects carrying a 'model' name and a
    'fields' mapping. For records whose model is listed in TARGETS, each
    configured field is converted:
    - null -> []
    - string -> [{"text": <string>, "language_code": <detected code>}]
    - anything else (e.g. a list) is kept as is.

    Records that are not objects, whose model is not targeted, or whose
    'fields' value is not an object are written back unchanged.

    Args:
        input_file: Path of the JSON file to read (UTF-8).
        output_file: Path the converted JSON is written to (UTF-8).

    Returns:
        None. Progress and errors are reported with print(); on a read,
        decode or structure error nothing is written.
    """
    print(f"📖 Reading {input_file}...")
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: File '{input_file}' not found.")
        return
    except json.JSONDecodeError as e:
        print(f"❌ Error: Failed to decode JSON. {e}")
        return

    # The loop below needs a list of records; report any other top-level
    # value the same way as the other input errors instead of crashing.
    if not isinstance(data, list):
        print(f"❌ Error: Expected a JSON array of records, got {type(data).__name__}.")
        return

    # Model name -> fields of that model to convert.
    TARGETS = {
        'hadis.narratorlayer': [
            'name',
            'description',
        ],
        'hadis.transmitters': [
            'full_name',
            'kunya',
            'known_as',
            'nickname',
            'origin',
            'lived_in',
            'died_in',
            'description',
        ],
        'hadis.transmitteropinion': [
            'scholar_name',
            'opinion_text',
        ],
        'hadis.transmitteroriginaltext': [
            'title',
            'text',
        ],
    }

    processed_count = 0
    for record in data:
        # Skip stray non-object entries rather than aborting the whole run.
        if not isinstance(record, dict):
            continue
        model = record.get('model')
        if not isinstance(model, str) or model not in TARGETS:
            continue
        fields = record.get('fields')
        # A missing or null 'fields' has nothing to convert; the record still
        # counts as processed because its model is targeted.
        if isinstance(fields, dict):
            _reformat_fields(fields, TARGETS[model])
        processed_count += 1

    print(f"✅ Processed {processed_count} records.")

    # Best-effort write: any failure here is reported, not raised.
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved reformatted data to: {output_file}")
    except Exception as e:
        print(f"❌ Error writing output file: {e}")
if __name__ == "__main__":
    # Source and destination JSON files used when the module is run as a script.
    INPUT_FILE = "transmitters_backup.json"
    OUTPUT_FILE = "transmitters_reformatted.json"

    reformat_data(input_file=INPUT_FILE, output_file=OUTPUT_FILE)