#!/usr/bin/env python3 """ Fix Corrupted Character Data in CMC Database Uses Python to properly handle UTF-8 encoding """ import subprocess import sys DB_USER = "root" DB_PASS = "secureRootPassword" DB_NAME = "cmc" CONTAINER = "cmc-db" # Mapping of corrupted patterns to correct values CORRUPTION_PATTERNS = { # Smart quotes and punctuation '\u00e2\u0080\u0099': '\u2019', # ’ → ' '\u00e2\u0080\u009c': '\u201c', # “ → " '\u00e2\u0080\u009d': '\u201d', # †→ " '\u00e2\u0080\u0093': '\u2013', # â€" → – '\u00e2\u0080\u0094': '\u2014', # â€" → — '\u00e2\u0080\u008b': '', # ​ → (zero-width space, remove) '\u00e2\u0080\u0083': ' ', #   → (em space, replace with regular space) # Accented characters '\u00c3\u00a9': '\u00e9', # é → é '\u00c3\u0089': '\u00c9', # É → É '\u00c3\u00b3': '\u00f3', # ó → ó '\u00c3\u00ad': '\u00ed', # à → í '\u00c3\u00a7': '\u00e7', # ç → ç '\u00c3\u00bc': '\u00fc', # ü → ü '\u00c3\u00a1': '\u00e1', # á → á '\u00c3\u00b1': '\u00f1', # ñ → ñ '\u00c3\u00b6': '\u00f6', # ö → ö '\u00c3\u00b4': '\u00f4', # ô → ô '\u00c3\u009f': '\u00df', # ß → ß (German sharp s) # Turkish characters '\u00c4\u00b0': '\u0130', # İ → İ '\u00c5\u009e': '\u015e', # Å → Ş '\u00c4\u009e': '\u011e', # Ä → Ğ # Czech characters '\u00c4\u008c': '\u010c', # Ä → Č '\u00c4\u009b': '\u011b', # Ä› → ě '\u00c4\u008d': '\u010d', # Ä → č '\u00c5\u00be': '\u017e', # ž → ž '\u00c5\u00a1': '\u0161', # Å¡ → š '\u00c5\u0099': '\u0159', # Å™ → ř '\u00c5\u0088': '\u0148', # Å → ň # Symbols '\u00c2\u00b0': '\u00b0', # ° → ° (degree) '\u00c2\u00ae': '\u00ae', # ® → ® (registered) '\u00e2\u0084\u00a2': '\u2122', # â„¢ → ™ (trademark) } def run_sql(sql): """Execute SQL command via docker""" cmd = [ 'docker', 'exec', CONTAINER, 'mariadb', '-u', DB_USER, f'-p{DB_PASS}', '--default-character-set=utf8mb4', DB_NAME, '-e', sql ] try: result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8') if result.returncode != 0: print(f"Error executing SQL: {result.stderr}", file=sys.stderr) return False return True except Exception as e: print(f"Exception: {e}", file=sys.stderr) return False def fix_table_column(table, column, patterns): """Fix corruption patterns in a specific table column""" print(f"Fixing {table}.{column}...") for corrupted, correct in patterns.items(): # Escape single quotes for SQL corrupted_esc = corrupted.replace("'", "''") correct_esc = correct.replace("'", "''") sql = f""" UPDATE {table} SET {column} = REPLACE({column}, '{corrupted_esc}', '{correct_esc}') WHERE {column} LIKE CONCAT('%', '{corrupted_esc}', '%'); """ if not run_sql(sql): print(f" Warning: Failed to fix pattern in {table}.{column}") print(f" ✓ {table}.{column} patterns fixed") def fix_html_entities(): """Fix HTML entities that shouldn't be in the database""" print("\nFixing HTML entities in products table...") entities = { '&': '&', ' ': ' ', '–': '–', '—': '—', '”': '"', '“': '"', '’': ''', '‘': ''', '°': '°', '½': '½', '¼': '¼', '¾': '¾', '×': '×', } for table_col in [('products', 'title'), ('products', 'description'), ('products', 'item_description')]: table, column = table_col for entity, char in entities.items(): sql = f""" UPDATE {table} SET {column} = REPLACE({column}, '{entity}', '{char}') WHERE {column} LIKE '%{entity}%'; """ run_sql(sql) print(" ✓ HTML entities fixed") def fix_specific_records(): """Fix specific known corrupted records""" print("\nFixing specific known records...") fixes = [ ("customers", 253, "name", "SOLUZÉ CIVIL ENGINEERS Trading Under SOLUZE PTY LTD"), ("customers", 1006, "name", "Dr. Prem's Molecules Private Limited (DPMolecules)"), ("customers", 1387, "name", 'DEE ENTERPRISES (QLD) PTY LTD trading as "MEKLEK"'), ("customers", 1608, "name", "Guidera O'Connor Pty Ltd"), ("customers", 2174, "name", "Ingredion ANZ Pty Ltd"), ("customers", 2215, "name", "Vale Nouvelle-Calédonie S.A.S"), ("customers", 2375, "name", "Evaluación y Control Ambiental S.A.S."), ("customers", 3143, "name", "Zontahevy Comércio e Serviços Offshore Ltda"), ("customers", 3529, "name", "Société des Mines de Syama"), ("customers", 4325, "name", "Rambøll Danmark"), ("customers", 4350, "name", "SONNIVA ENERGY MAKİNA İNŞAAT İTH. İHR. LTD. ŞTİ."), ("customers", 4669, "name", "F.C.C. Fluides Conseils Calédonie SARL"), ("customers", 4743, "name", "DGPack Prostějov"), ("customers", 4893, "name", "Oxiquímica, S.A.P.I. de C.V."), ("principles", 2, "address", "Heinz-Fangman-Straße 18"), ("principles", 9, "address", "Bezručova 2901\\n756 61 Rožnov pod Radhoštěm"), ("principles", 9, "city", "Rožnov pod Radhoštěm"), ("principles", 13, "name", "IEP Technologies GmbH - BRILEX Gesellschaft für Explosionsschutz mbH"), ("principles", 14, "address", "Liebigstraße 2"), ("principles", 58, "name", "Nosia S.r.l.- Señalización Industrial"), ("principles", 65, "address", "Alte Emser Straße 32"), ("addresses", 19, "address", "Lvl 3, Building B, 7–11 Talavera Road"), ("products", 30, "title", "C95SN189C DSTGL40/C-Digital temperature probe for P8xx1 Web Sensor. Range -30 to +80°C. With CINCH connector, 1m Cable"), ("products", 38, "title", "Mid-West Instrument Model 240 – SC – 02 – O(TT)"), ("products", 67, "title", "Newson Gale Earth-Rite® II FIBC Static Earthing System, GRP Enclosure with 5m 2 Core Spiral Cable and FIBC Clamp"), ("products", 76, "title", "Newson Gale Earth-Rite® RTR Tester - ER2/CRT"), ("products", 85, "title", "Newson Gale Earth-Rite® RTR™ Tri-Mode Static Grounding System, Metal Enclosure, X90-IP Heavy Duty Clamp with 10m 2 Core Spiral Cable"), ] for table, record_id, column, value in fixes: value_esc = value.replace("'", "''") sql = f"UPDATE {table} SET {column} = '{value_esc}' WHERE id = {record_id};" run_sql(sql) print(f" ✓ {len(fixes)} specific records fixed") def verify(): """Verify the fixes""" print("\n" + "="*60) print("Verification Results") print("="*60) run_sql("SELECT 'Remaining corrupted customers:', COUNT(*) FROM customers WHERE name REGEXP '[^ -~]';") run_sql("SELECT 'Remaining corrupted principles:', COUNT(*) FROM principles WHERE name REGEXP '[^ -~]' OR address REGEXP '[^ -~]';") run_sql("SELECT 'Remaining corrupted addresses:', COUNT(*) FROM addresses WHERE address REGEXP '[^ -~]';") run_sql("SELECT 'Remaining corrupted products:', COUNT(*) FROM products WHERE title REGEXP '[^ -~]' OR description REGEXP '[^ -~]';") def main(): print("="*60) print("CMC Database Character Corruption Fix") print("="*60) print("") # Set charset run_sql("SET NAMES utf8mb4;") # Fix pattern-based corruption fix_table_column("customers", "name", CORRUPTION_PATTERNS) fix_table_column("principles", "name", CORRUPTION_PATTERNS) fix_table_column("principles", "address", CORRUPTION_PATTERNS) fix_table_column("principles", "city", CORRUPTION_PATTERNS) fix_table_column("addresses", "address", CORRUPTION_PATTERNS) fix_table_column("addresses", "city", CORRUPTION_PATTERNS) fix_table_column("products", "title", CORRUPTION_PATTERNS) fix_table_column("products", "description", CORRUPTION_PATTERNS) fix_table_column("products", "item_description", CORRUPTION_PATTERNS) # Fix HTML entities fix_html_entities() # Fix specific records fix_specific_records() # Verify verify() print("\n" + "="*60) print("Fix complete!") print("="*60) if __name__ == "__main__": main()