223 lines
8.4 KiB
Python
Executable file
223 lines
8.4 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
"""
|
||
Fix Corrupted Character Data in CMC Database
|
||
Uses Python to properly handle UTF-8 encoding
|
||
"""
|
||
|
||
import subprocess
|
||
import sys
|
||
|
||
DB_USER = "root"
|
||
DB_PASS = "secureRootPassword"
|
||
DB_NAME = "cmc"
|
||
CONTAINER = "cmc-db"
|
||
|
||
# Mapping of corrupted patterns to correct values
|
||
CORRUPTION_PATTERNS = {
|
||
# Smart quotes and punctuation
|
||
'\u00e2\u0080\u0099': '\u2019', # ’ → '
|
||
'\u00e2\u0080\u009c': '\u201c', # “ → "
|
||
'\u00e2\u0080\u009d': '\u201d', # †→ "
|
||
'\u00e2\u0080\u0093': '\u2013', # â€" → –
|
||
'\u00e2\u0080\u0094': '\u2014', # â€" → —
|
||
'\u00e2\u0080\u008b': '', # ​ → (zero-width space, remove)
|
||
'\u00e2\u0080\u0083': ' ', #   → (em space, replace with regular space)
|
||
|
||
# Accented characters
|
||
'\u00c3\u00a9': '\u00e9', # é → é
|
||
'\u00c3\u0089': '\u00c9', # É → É
|
||
'\u00c3\u00b3': '\u00f3', # ó → ó
|
||
'\u00c3\u00ad': '\u00ed', # Ã → í
|
||
'\u00c3\u00a7': '\u00e7', # ç → ç
|
||
'\u00c3\u00bc': '\u00fc', # ü → ü
|
||
'\u00c3\u00a1': '\u00e1', # á → á
|
||
'\u00c3\u00b1': '\u00f1', # ñ → ñ
|
||
'\u00c3\u00b6': '\u00f6', # ö → ö
|
||
'\u00c3\u00b4': '\u00f4', # ô → ô
|
||
'\u00c3\u009f': '\u00df', # ß → ß (German sharp s)
|
||
|
||
# Turkish characters
|
||
'\u00c4\u00b0': '\u0130', # İ → İ
|
||
'\u00c5\u009e': '\u015e', # Å → Ş
|
||
'\u00c4\u009e': '\u011e', # Ä → Ğ
|
||
|
||
# Czech characters
|
||
'\u00c4\u008c': '\u010c', # Ä → Č
|
||
'\u00c4\u009b': '\u011b', # Ä› → ě
|
||
'\u00c4\u008d': '\u010d', # Ä → č
|
||
'\u00c5\u00be': '\u017e', # ž → ž
|
||
'\u00c5\u00a1': '\u0161', # Å¡ → š
|
||
'\u00c5\u0099': '\u0159', # Å™ → ř
|
||
'\u00c5\u0088': '\u0148', # Å → ň
|
||
|
||
# Symbols
|
||
'\u00c2\u00b0': '\u00b0', # ° → ° (degree)
|
||
'\u00c2\u00ae': '\u00ae', # ® → ® (registered)
|
||
'\u00e2\u0084\u00a2': '\u2122', # â„¢ → ™ (trademark)
|
||
}
|
||
|
||
def run_sql(sql):
|
||
"""Execute SQL command via docker"""
|
||
cmd = [
|
||
'docker', 'exec', CONTAINER,
|
||
'mariadb',
|
||
'-u', DB_USER,
|
||
f'-p{DB_PASS}',
|
||
'--default-character-set=utf8mb4',
|
||
DB_NAME,
|
||
'-e', sql
|
||
]
|
||
|
||
try:
|
||
result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')
|
||
if result.returncode != 0:
|
||
print(f"Error executing SQL: {result.stderr}", file=sys.stderr)
|
||
return False
|
||
return True
|
||
except Exception as e:
|
||
print(f"Exception: {e}", file=sys.stderr)
|
||
return False
|
||
|
||
def fix_table_column(table, column, patterns):
|
||
"""Fix corruption patterns in a specific table column"""
|
||
print(f"Fixing {table}.{column}...")
|
||
|
||
for corrupted, correct in patterns.items():
|
||
# Escape single quotes for SQL
|
||
corrupted_esc = corrupted.replace("'", "''")
|
||
correct_esc = correct.replace("'", "''")
|
||
|
||
sql = f"""
|
||
UPDATE {table}
|
||
SET {column} = REPLACE({column}, '{corrupted_esc}', '{correct_esc}')
|
||
WHERE {column} LIKE CONCAT('%', '{corrupted_esc}', '%');
|
||
"""
|
||
|
||
if not run_sql(sql):
|
||
print(f" Warning: Failed to fix pattern in {table}.{column}")
|
||
|
||
print(f" ✓ {table}.{column} patterns fixed")
|
||
|
||
def fix_html_entities():
|
||
"""Fix HTML entities that shouldn't be in the database"""
|
||
print("\nFixing HTML entities in products table...")
|
||
|
||
entities = {
|
||
'&': '&',
|
||
' ': ' ',
|
||
'–': '–',
|
||
'—': '—',
|
||
'”': '"',
|
||
'“': '"',
|
||
'’': ''',
|
||
'‘': ''',
|
||
'°': '°',
|
||
'½': '½',
|
||
'¼': '¼',
|
||
'¾': '¾',
|
||
'×': '×',
|
||
}
|
||
|
||
for table_col in [('products', 'title'), ('products', 'description'), ('products', 'item_description')]:
|
||
table, column = table_col
|
||
for entity, char in entities.items():
|
||
sql = f"""
|
||
UPDATE {table}
|
||
SET {column} = REPLACE({column}, '{entity}', '{char}')
|
||
WHERE {column} LIKE '%{entity}%';
|
||
"""
|
||
run_sql(sql)
|
||
|
||
print(" ✓ HTML entities fixed")
|
||
|
||
def fix_specific_records():
|
||
"""Fix specific known corrupted records"""
|
||
print("\nFixing specific known records...")
|
||
|
||
fixes = [
|
||
("customers", 253, "name", "SOLUZÉ CIVIL ENGINEERS Trading Under SOLUZE PTY LTD"),
|
||
("customers", 1006, "name", "Dr. Prem's Molecules Private Limited (DPMolecules)"),
|
||
("customers", 1387, "name", 'DEE ENTERPRISES (QLD) PTY LTD trading as "MEKLEK"'),
|
||
("customers", 1608, "name", "Guidera O'Connor Pty Ltd"),
|
||
("customers", 2174, "name", "Ingredion ANZ Pty Ltd"),
|
||
("customers", 2215, "name", "Vale Nouvelle-Calédonie S.A.S"),
|
||
("customers", 2375, "name", "Evaluación y Control Ambiental S.A.S."),
|
||
("customers", 3143, "name", "Zontahevy Comércio e Serviços Offshore Ltda"),
|
||
("customers", 3529, "name", "Société des Mines de Syama"),
|
||
("customers", 4325, "name", "Rambøll Danmark"),
|
||
("customers", 4350, "name", "SONNIVA ENERGY MAKİNA İNŞAAT İTH. İHR. LTD. ŞTİ."),
|
||
("customers", 4669, "name", "F.C.C. Fluides Conseils Calédonie SARL"),
|
||
("customers", 4743, "name", "DGPack Prostějov"),
|
||
("customers", 4893, "name", "Oxiquímica, S.A.P.I. de C.V."),
|
||
|
||
("principles", 2, "address", "Heinz-Fangman-Straße 18"),
|
||
("principles", 9, "address", "Bezručova 2901\\n756 61 Rožnov pod Radhoštěm"),
|
||
("principles", 9, "city", "Rožnov pod Radhoštěm"),
|
||
("principles", 13, "name", "IEP Technologies GmbH - BRILEX Gesellschaft für Explosionsschutz mbH"),
|
||
("principles", 14, "address", "Liebigstraße 2"),
|
||
("principles", 58, "name", "Nosia S.r.l.- Señalización Industrial"),
|
||
("principles", 65, "address", "Alte Emser Straße 32"),
|
||
|
||
("addresses", 19, "address", "Lvl 3, Building B, 7–11 Talavera Road"),
|
||
|
||
("products", 30, "title", "C95SN189C DSTGL40/C-Digital temperature probe for P8xx1 Web Sensor. Range -30 to +80°C. With CINCH connector, 1m Cable"),
|
||
("products", 38, "title", "Mid-West Instrument Model 240 – SC – 02 – O(TT)"),
|
||
("products", 67, "title", "Newson Gale Earth-Rite® II FIBC Static Earthing System, GRP Enclosure with 5m 2 Core Spiral Cable and FIBC Clamp"),
|
||
("products", 76, "title", "Newson Gale Earth-Rite® RTR Tester - ER2/CRT"),
|
||
("products", 85, "title", "Newson Gale Earth-Rite® RTR™ Tri-Mode Static Grounding System, Metal Enclosure, X90-IP Heavy Duty Clamp with 10m 2 Core Spiral Cable"),
|
||
]
|
||
|
||
for table, record_id, column, value in fixes:
|
||
value_esc = value.replace("'", "''")
|
||
sql = f"UPDATE {table} SET {column} = '{value_esc}' WHERE id = {record_id};"
|
||
run_sql(sql)
|
||
|
||
print(f" ✓ {len(fixes)} specific records fixed")
|
||
|
||
def verify():
|
||
"""Verify the fixes"""
|
||
print("\n" + "="*60)
|
||
print("Verification Results")
|
||
print("="*60)
|
||
|
||
run_sql("SELECT 'Remaining corrupted customers:', COUNT(*) FROM customers WHERE name REGEXP '[^ -~]';")
|
||
run_sql("SELECT 'Remaining corrupted principles:', COUNT(*) FROM principles WHERE name REGEXP '[^ -~]' OR address REGEXP '[^ -~]';")
|
||
run_sql("SELECT 'Remaining corrupted addresses:', COUNT(*) FROM addresses WHERE address REGEXP '[^ -~]';")
|
||
run_sql("SELECT 'Remaining corrupted products:', COUNT(*) FROM products WHERE title REGEXP '[^ -~]' OR description REGEXP '[^ -~]';")
|
||
|
||
def main():
|
||
print("="*60)
|
||
print("CMC Database Character Corruption Fix")
|
||
print("="*60)
|
||
print("")
|
||
|
||
# Set charset
|
||
run_sql("SET NAMES utf8mb4;")
|
||
|
||
# Fix pattern-based corruption
|
||
fix_table_column("customers", "name", CORRUPTION_PATTERNS)
|
||
fix_table_column("principles", "name", CORRUPTION_PATTERNS)
|
||
fix_table_column("principles", "address", CORRUPTION_PATTERNS)
|
||
fix_table_column("principles", "city", CORRUPTION_PATTERNS)
|
||
fix_table_column("addresses", "address", CORRUPTION_PATTERNS)
|
||
fix_table_column("addresses", "city", CORRUPTION_PATTERNS)
|
||
fix_table_column("products", "title", CORRUPTION_PATTERNS)
|
||
fix_table_column("products", "description", CORRUPTION_PATTERNS)
|
||
fix_table_column("products", "item_description", CORRUPTION_PATTERNS)
|
||
|
||
# Fix HTML entities
|
||
fix_html_entities()
|
||
|
||
# Fix specific records
|
||
fix_specific_records()
|
||
|
||
# Verify
|
||
verify()
|
||
|
||
print("\n" + "="*60)
|
||
print("Fix complete!")
|
||
print("="*60)
|
||
|
||
if __name__ == "__main__":
|
||
main()
|