cmc-sales/scripts/fix_corrupted_data.py
2025-12-23 15:54:26 +11:00

223 lines
8.4 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Fix Corrupted Character Data in CMC Database
Uses Python to properly handle UTF-8 encoding
"""
import subprocess
import sys
DB_USER = "root"
DB_PASS = "secureRootPassword"
DB_NAME = "cmc"
CONTAINER = "cmc-db"
# Mapping of corrupted patterns to correct values
CORRUPTION_PATTERNS = {
# Smart quotes and punctuation
'\u00e2\u0080\u0099': '\u2019', # ’ → '
'\u00e2\u0080\u009c': '\u201c', # “ → "
'\u00e2\u0080\u009d': '\u201d', # †→ "
'\u00e2\u0080\u0093': '\u2013', # â€" →
'\u00e2\u0080\u0094': '\u2014', # â€" → —
'\u00e2\u0080\u008b': '', # ​ → (zero-width space, remove)
'\u00e2\u0080\u0083': ' ', #   → (em space, replace with regular space)
# Accented characters
'\u00c3\u00a9': '\u00e9', # é → é
'\u00c3\u0089': '\u00c9', # É → É
'\u00c3\u00b3': '\u00f3', # ó → ó
'\u00c3\u00ad': '\u00ed', # Ã → í
'\u00c3\u00a7': '\u00e7', # ç → ç
'\u00c3\u00bc': '\u00fc', # ü → ü
'\u00c3\u00a1': '\u00e1', # á → á
'\u00c3\u00b1': '\u00f1', # ñ → ñ
'\u00c3\u00b6': '\u00f6', # ö → ö
'\u00c3\u00b4': '\u00f4', # ô → ô
'\u00c3\u009f': '\u00df', # ß → ß (German sharp s)
# Turkish characters
'\u00c4\u00b0': '\u0130', # İ → İ
'\u00c5\u009e': '\u015e', # Å → Ş
'\u00c4\u009e': '\u011e', # Ä → Ğ
# Czech characters
'\u00c4\u008c': '\u010c', # Ä → Č
'\u00c4\u009b': '\u011b', # Ä› → ě
'\u00c4\u008d': '\u010d', # Ä → č
'\u00c5\u00be': '\u017e', # ž → ž
'\u00c5\u00a1': '\u0161', # Å¡ → š
'\u00c5\u0099': '\u0159', # Å™ → ř
'\u00c5\u0088': '\u0148', # Å → ň
# Symbols
'\u00c2\u00b0': '\u00b0', # ° → ° (degree)
'\u00c2\u00ae': '\u00ae', # ® → ® (registered)
'\u00e2\u0084\u00a2': '\u2122', # â„¢ → ™ (trademark)
}
def run_sql(sql):
"""Execute SQL command via docker"""
cmd = [
'docker', 'exec', CONTAINER,
'mariadb',
'-u', DB_USER,
f'-p{DB_PASS}',
'--default-character-set=utf8mb4',
DB_NAME,
'-e', sql
]
try:
result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')
if result.returncode != 0:
print(f"Error executing SQL: {result.stderr}", file=sys.stderr)
return False
return True
except Exception as e:
print(f"Exception: {e}", file=sys.stderr)
return False
def fix_table_column(table, column, patterns):
"""Fix corruption patterns in a specific table column"""
print(f"Fixing {table}.{column}...")
for corrupted, correct in patterns.items():
# Escape single quotes for SQL
corrupted_esc = corrupted.replace("'", "''")
correct_esc = correct.replace("'", "''")
sql = f"""
UPDATE {table}
SET {column} = REPLACE({column}, '{corrupted_esc}', '{correct_esc}')
WHERE {column} LIKE CONCAT('%', '{corrupted_esc}', '%');
"""
if not run_sql(sql):
print(f" Warning: Failed to fix pattern in {table}.{column}")
print(f"{table}.{column} patterns fixed")
def fix_html_entities():
"""Fix HTML entities that shouldn't be in the database"""
print("\nFixing HTML entities in products table...")
entities = {
'&': '&',
' ': ' ',
'–': '',
'—': '',
'”': '"',
'“': '"',
'’': ''',
'‘': ''',
'°': '°',
'½': '½',
'¼': '¼',
'¾': '¾',
'×': '×',
}
for table_col in [('products', 'title'), ('products', 'description'), ('products', 'item_description')]:
table, column = table_col
for entity, char in entities.items():
sql = f"""
UPDATE {table}
SET {column} = REPLACE({column}, '{entity}', '{char}')
WHERE {column} LIKE '%{entity}%';
"""
run_sql(sql)
print(" ✓ HTML entities fixed")
def fix_specific_records():
"""Fix specific known corrupted records"""
print("\nFixing specific known records...")
fixes = [
("customers", 253, "name", "SOLUZÉ CIVIL ENGINEERS Trading Under SOLUZE PTY LTD"),
("customers", 1006, "name", "Dr. Prem's Molecules Private Limited (DPMolecules)"),
("customers", 1387, "name", 'DEE ENTERPRISES (QLD) PTY LTD trading as "MEKLEK"'),
("customers", 1608, "name", "Guidera O'Connor Pty Ltd"),
("customers", 2174, "name", "Ingredion ANZ Pty Ltd"),
("customers", 2215, "name", "Vale Nouvelle-Calédonie S.A.S"),
("customers", 2375, "name", "Evaluación y Control Ambiental S.A.S."),
("customers", 3143, "name", "Zontahevy Comércio e Serviços Offshore Ltda"),
("customers", 3529, "name", "Société des Mines de Syama"),
("customers", 4325, "name", "Rambøll Danmark"),
("customers", 4350, "name", "SONNIVA ENERGY MAKİNA İNŞAAT İTH. İHR. LTD. ŞTİ."),
("customers", 4669, "name", "F.C.C. Fluides Conseils Calédonie SARL"),
("customers", 4743, "name", "DGPack Prostějov"),
("customers", 4893, "name", "Oxiquímica, S.A.P.I. de C.V."),
("principles", 2, "address", "Heinz-Fangman-Straße 18"),
("principles", 9, "address", "Bezručova 2901\\n756 61 Rožnov pod Radhoštěm"),
("principles", 9, "city", "Rožnov pod Radhoštěm"),
("principles", 13, "name", "IEP Technologies GmbH - BRILEX Gesellschaft für Explosionsschutz mbH"),
("principles", 14, "address", "Liebigstraße 2"),
("principles", 58, "name", "Nosia S.r.l.- Señalización Industrial"),
("principles", 65, "address", "Alte Emser Straße 32"),
("addresses", 19, "address", "Lvl 3, Building B, 711 Talavera Road"),
("products", 30, "title", "C95SN189C DSTGL40/C-Digital temperature probe for P8xx1 Web Sensor. Range -30 to +80°C. With CINCH connector, 1m Cable"),
("products", 38, "title", "Mid-West Instrument Model 240 SC 02 O(TT)"),
("products", 67, "title", "Newson Gale Earth-Rite® II FIBC Static Earthing System, GRP Enclosure with 5m 2 Core Spiral Cable and FIBC Clamp"),
("products", 76, "title", "Newson Gale Earth-Rite® RTR Tester - ER2/CRT"),
("products", 85, "title", "Newson Gale Earth-Rite® RTR™ Tri-Mode Static Grounding System, Metal Enclosure, X90-IP Heavy Duty Clamp with 10m 2 Core Spiral Cable"),
]
for table, record_id, column, value in fixes:
value_esc = value.replace("'", "''")
sql = f"UPDATE {table} SET {column} = '{value_esc}' WHERE id = {record_id};"
run_sql(sql)
print(f"{len(fixes)} specific records fixed")
def verify():
"""Verify the fixes"""
print("\n" + "="*60)
print("Verification Results")
print("="*60)
run_sql("SELECT 'Remaining corrupted customers:', COUNT(*) FROM customers WHERE name REGEXP '[^ -~]';")
run_sql("SELECT 'Remaining corrupted principles:', COUNT(*) FROM principles WHERE name REGEXP '[^ -~]' OR address REGEXP '[^ -~]';")
run_sql("SELECT 'Remaining corrupted addresses:', COUNT(*) FROM addresses WHERE address REGEXP '[^ -~]';")
run_sql("SELECT 'Remaining corrupted products:', COUNT(*) FROM products WHERE title REGEXP '[^ -~]' OR description REGEXP '[^ -~]';")
def main():
print("="*60)
print("CMC Database Character Corruption Fix")
print("="*60)
print("")
# Set charset
run_sql("SET NAMES utf8mb4;")
# Fix pattern-based corruption
fix_table_column("customers", "name", CORRUPTION_PATTERNS)
fix_table_column("principles", "name", CORRUPTION_PATTERNS)
fix_table_column("principles", "address", CORRUPTION_PATTERNS)
fix_table_column("principles", "city", CORRUPTION_PATTERNS)
fix_table_column("addresses", "address", CORRUPTION_PATTERNS)
fix_table_column("addresses", "city", CORRUPTION_PATTERNS)
fix_table_column("products", "title", CORRUPTION_PATTERNS)
fix_table_column("products", "description", CORRUPTION_PATTERNS)
fix_table_column("products", "item_description", CORRUPTION_PATTERNS)
# Fix HTML entities
fix_html_entities()
# Fix specific records
fix_specific_records()
# Verify
verify()
print("\n" + "="*60)
print("Fix complete!")
print("="*60)
if __name__ == "__main__":
main()