cmc-sales/scripts/fix_corrupted_data.sql
2025-12-23 15:54:26 +11:00

318 lines
16 KiB
SQL
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

-- ============================================================================
-- Fix Corrupted Character Data in CMC Database
-- ============================================================================
-- This script fixes mojibake (corrupted UTF-8 characters) in the database
-- Run after converting tables to utf8mb4
--
-- IMPORTANT: Review this script and test on a backup first!
-- Create backup: docker exec cmc-db mariadb-dump -u root -psecureRootPassword cmc | gzip > backup_before_fix.sql.gz
--
-- To run: docker exec -i cmc-db mariadb -u root -psecureRootPassword --default-character-set=utf8mb4 cmc < scripts/fix_corrupted_data.sql
-- ============================================================================
-- Create a backup table for principles (optional safety measure)
-- DROP TABLE IF EXISTS principles_backup;
-- CREATE TABLE principles_backup AS SELECT * FROM principles;
SET NAMES utf8mb4;
-- ============================================================================
-- PART 1: Global Pattern Fixes Using REPLACE
-- ============================================================================
-- These fix common mojibake patterns across all affected tables
-- ----------------------------------------------------------------------------
-- Fix CUSTOMERS table
-- ----------------------------------------------------------------------------
UPDATE customers SET name = REPLACE(name, '’', ''') WHERE name LIKE '%â%'; -- Smart apostrophe
UPDATE customers SET name = REPLACE(name, 'âœ', '"') WHERE name LIKE '%“%'; -- Left double quote
UPDATE customers SET name = REPLACE(name, 'â€', '"') WHERE name LIKE '%â%'; -- Right double quote
UPDATE customers SET name = REPLACE(name, 'â"', '') WHERE name LIKE '%â€"%'; -- En dash
UPDATE customers SET name = REPLACE(name, 'â"', '—') WHERE name LIKE '%â€"%'; -- Em dash
UPDATE customers SET name = REPLACE(name, 'â', '') WHERE name LIKE '%â%'; -- Zero-width space (remove)
UPDATE customers SET name = REPLACE(name, 'âƒ', ' ') WHERE name LIKE '%âƒ%'; -- Special space
UPDATE customers SET name = REPLACE(name, 'é', 'é') WHERE name LIKE '%é%'; -- é
UPDATE customers SET name = REPLACE(name, 'Ã', 'É') WHERE name LIKE '%Ã%'; -- É
UPDATE customers SET name = REPLACE(name, 'ó', 'ó') WHERE name LIKE '%ó%'; -- ó
UPDATE customers SET name = REPLACE(name, 'Ã', 'í') WHERE name LIKE '%Ã%'; -- í
UPDATE customers SET name = REPLACE(name, 'ç', 'ç') WHERE name LIKE '%ç%'; -- ç
UPDATE customers SET name = REPLACE(name, 'ü', 'ü') WHERE name LIKE '%ü%'; -- ü
UPDATE customers SET name = REPLACE(name, 'á', 'á') WHERE name LIKE '%á%'; -- á
UPDATE customers SET name = REPLACE(name, 'ñ', 'ñ') WHERE name LIKE '%ñ%'; -- ñ
UPDATE customers SET name = REPLACE(name, 'Ã', 'ö') WHERE name LIKE '%Ã%'; -- ö
UPDATE customers SET name = REPLACE(name, 'ô', 'ô') WHERE name LIKE '%ô%'; -- ô
UPDATE customers SET name = REPLACE(name, 'İ', 'İ') WHERE name LIKE '%İ%'; -- Turkish İ
UPDATE customers SET name = REPLACE(name, 'Å', 'Ş') WHERE name LIKE '%Å%'; -- Turkish Ş
UPDATE customers SET name = REPLACE(name, 'Ä', 'Ğ') WHERE name LIKE '%Ä%'; -- Turkish Ğ
UPDATE customers SET name = REPLACE(name, 'Ä', 'Č') WHERE name LIKE '%Ä%'; -- Czech Č
UPDATE customers SET name = REPLACE(name, 'Ä', 'ě') WHERE name LIKE '%Ä%'; -- Czech ě
-- Remove trailing tabs
UPDATE customers SET name = TRIM(TRAILING '\t' FROM name) WHERE name LIKE '%\t';
UPDATE customers SET trading_name = TRIM(TRAILING '\t' FROM trading_name) WHERE trading_name LIKE '%\t';
-- ----------------------------------------------------------------------------
-- Fix PRINCIPLES table
-- ----------------------------------------------------------------------------
UPDATE principles SET name = REPLACE(name, 'ü', 'ü') WHERE name LIKE '%ü%';
UPDATE principles SET name = REPLACE(name, 'é', 'é') WHERE name LIKE '%é%';
UPDATE principles SET name = REPLACE(name, 'ó', 'ó') WHERE name LIKE '%ó%';
UPDATE principles SET address = REPLACE(address, 'ß', 'ß') WHERE address LIKE '%ß%'; -- German ß
UPDATE principles SET address = REPLACE(address, 'ü', 'ü') WHERE address LIKE '%ü%';
UPDATE principles SET address = REPLACE(address, 'é', 'é') WHERE address LIKE '%é%';
UPDATE principles SET address = REPLACE(address, 'ñ', 'ñ') WHERE address LIKE '%ñ%';
UPDATE principles SET address = REPLACE(address, 'Ä', 'č') WHERE address LIKE '%Ä%'; -- Czech č
UPDATE principles SET address = REPLACE(address, 'ž', 'ž') WHERE address LIKE '%ž%'; -- Czech ž
UPDATE principles SET address = REPLACE(address, 'Å¡', 'š') WHERE address LIKE '%Å¡%'; -- Czech š
UPDATE principles SET address = REPLACE(address, 'Ä', 'ě') WHERE address LIKE '%Ä%'; -- Czech ě
UPDATE principles SET address = REPLACE(address, 'Å', 'ň') WHERE address LIKE '%Å%'; -- Czech ň
UPDATE principles SET address = REPLACE(address, 'Å', 'ř') WHERE address LIKE '%Å%'; -- Czech ř
UPDATE principles SET city = REPLACE(city, 'ü', 'ü') WHERE city LIKE '%ü%';
UPDATE principles SET city = REPLACE(city, 'é', 'é') WHERE city LIKE '%é%';
UPDATE principles SET city = REPLACE(city, 'ó', 'ó') WHERE city LIKE '%ó%';
-- ----------------------------------------------------------------------------
-- Fix ADDRESSES table
-- ----------------------------------------------------------------------------
UPDATE addresses SET address = REPLACE(address, 'â"', '') WHERE address LIKE '%â€"%'; -- En dash
UPDATE addresses SET address = REPLACE(address, 'â"', '—') WHERE address LIKE '%â€"%'; -- Em dash
UPDATE addresses SET address = REPLACE(address, 'é', 'é') WHERE address LIKE '%é%';
UPDATE addresses SET address = REPLACE(address, 'ü', 'ü') WHERE address LIKE '%ü%';
UPDATE addresses SET city = REPLACE(city, 'é', 'é') WHERE city LIKE '%é%';
UPDATE addresses SET city = REPLACE(city, 'ü', 'ü') WHERE city LIKE '%ü%';
-- ============================================================================
-- PART 2: Specific Record Fixes
-- ============================================================================
-- These fix specific known corrupted records where pattern matching isn't enough
-- ----------------------------------------------------------------------------
-- PRINCIPLES - Specific fixes
-- ----------------------------------------------------------------------------
-- ID 2: German ß
UPDATE principles
SET address = 'Heinz-Fangman-Straße 18'
WHERE id = 2;
-- ID 9: Czech characters (this is complex, needs full correction)
UPDATE principles
SET address = 'Bezručova 2901\n756 61 Rožnov pod Radhoštěm',
city = 'Rožnov pod Radhoštěm'
WHERE id = 9;
-- ID 13: German ü
UPDATE principles
SET name = 'IEP Technologies GmbH - BRILEX Gesellschaft für Explosionsschutz mbH'
WHERE id = 13;
-- ID 14: German ß
UPDATE principles
SET address = 'Liebigstraße 2'
WHERE id = 14;
-- ID 58: Spanish ó
UPDATE principles
SET name = 'Nosia S.r.l.- Señalización Industrial'
WHERE id = 58;
-- ID 65: German ß
UPDATE principles
SET address = 'Alte Emser Straße 32'
WHERE id = 65;
-- ----------------------------------------------------------------------------
-- CUSTOMERS - Specific fixes
-- ----------------------------------------------------------------------------
-- ID 253: É
UPDATE customers
SET name = 'SOLUZÉ CIVIL ENGINEERS Trading Under SOLUZE PTY LTD'
WHERE id = 253;
-- ID 1006: Smart apostrophe
UPDATE customers
SET name = 'Dr. Prem''s Molecules Private Limited (DPMolecules)'
WHERE id = 1006;
-- ID 1387: Smart quotes
UPDATE customers
SET name = 'DEE ENTERPRISES (QLD) PTY LTD trading as "MEKLEK"'
WHERE id = 1387;
-- ID 1608: Smart apostrophe
UPDATE customers
SET name = 'Guidera O''Connor Pty Ltd'
WHERE id = 1608;
-- ID 2174: Zero-width space
UPDATE customers
SET name = 'Ingredion ANZ Pty Ltd'
WHERE id = 2174;
-- ID 2215: French é
UPDATE customers
SET name = 'Vale Nouvelle-Calédonie S.A.S'
WHERE id = 2215;
-- ID 2375: Spanish ó
UPDATE customers
SET name = 'Evaluación y Control Ambiental S.A.S.'
WHERE id = 2375;
-- ID 3143: Portuguese ç and ó
UPDATE customers
SET name = 'Zontahevy Comércio e Serviços Offshore Ltda'
WHERE id = 3143;
-- ID 3529: French é
UPDATE customers
SET name = 'Société des Mines de Syama'
WHERE id = 3529;
-- ID 3633: Special space (em space)
UPDATE customers
SET name = 'P.J. Berriman & Co PTY LTD'
WHERE id = 3633;
-- ID 4325: Danish ø
UPDATE customers
SET name = 'Rambøll Danmark'
WHERE id = 4325;
-- ID 4350: Turkish characters İ, Ş, Ğ
UPDATE customers
SET name = 'SONNIVA ENERGY MAKİNA İNŞAAT İTH. İHR. LTD. ŞTİ.'
WHERE id = 4350;
-- ID 4669: French é
UPDATE customers
SET name = 'F.C.C. Fluides Conseils Calédonie SARL'
WHERE id = 4669;
-- ID 4743: Czech ě
UPDATE customers
SET name = 'DGPack Prostějov'
WHERE id = 4743;
-- ID 4764: Zero-width space
UPDATE customers
SET name = 'Mccready Welding Pty Ltd Trading under the entity Mccready''s Welding Services'
WHERE id = 4764;
-- ID 4893: Spanish í
UPDATE customers
SET name = 'Oxiquímica, S.A.P.I. de C.V.'
WHERE id = 4893;
-- ----------------------------------------------------------------------------
-- ADDRESSES - Specific fixes
-- ----------------------------------------------------------------------------
-- ID 19: En dash (711)
UPDATE addresses
SET address = 'Lvl 3, Building B, 711 Talavera Road'
WHERE id = 19;
-- ============================================================================
-- PART 3: Fix PRODUCTS Table (12,536+ corrupted records)
-- ============================================================================
-- Products table has extensive corruption from mojibake and HTML entities
-- Degree symbol: ° → °
UPDATE products SET title = REPLACE(title, '°', '°') WHERE title LIKE '%°%';
UPDATE products SET description = REPLACE(description, '°', '°') WHERE description LIKE '%°%';
UPDATE products SET item_description = REPLACE(item_description, '°', '°') WHERE item_description LIKE '%°%';
-- Registered trademark: ® → ®
UPDATE products SET title = REPLACE(title, '®', '®') WHERE title LIKE '%®%';
UPDATE products SET description = REPLACE(description, '®', '®') WHERE description LIKE '%®%';
UPDATE products SET item_description = REPLACE(item_description, '®', '®') WHERE item_description LIKE '%®%';
-- Trademark: â„¢ → ™
UPDATE products SET title = REPLACE(title, 'â„¢', '') WHERE title LIKE '%â„¢%';
UPDATE products SET description = REPLACE(description, 'â„¢', '') WHERE description LIKE '%â„¢%';
UPDATE products SET item_description = REPLACE(item_description, 'â„¢', '') WHERE item_description LIKE '%â„¢%';
-- Smart quotes and dashes in products
UPDATE products SET title = REPLACE(title, '’', ''') WHERE title LIKE '%â%';
UPDATE products SET description = REPLACE(description, 'â', ''') WHERE description LIKE '%’%';
UPDATE products SET title = REPLACE(title, '“', '"') WHERE title LIKE '%“%';
UPDATE products SET description = REPLACE(description, '“', '"') WHERE description LIKE '%“%';
UPDATE products SET title = REPLACE(title, 'â€', '"') WHERE title LIKE '%â€%';
UPDATE products SET description = REPLACE(description, 'â€', '"') WHERE description LIKE '%â€%';
UPDATE products SET title = REPLACE(title, 'â€"', '') WHERE title LIKE '%â€"%';
UPDATE products SET description = REPLACE(description, 'â€"', '') WHERE description LIKE '%â€"%';
UPDATE products SET title = REPLACE(title, 'â€"', '') WHERE title LIKE '%â€"%';
UPDATE products SET description = REPLACE(description, 'â€"', '') WHERE description LIKE '%â€"%';
-- Accented characters in products
UPDATE products SET title = REPLACE(title, 'é', 'é') WHERE title LIKE '%é%';
UPDATE products SET description = REPLACE(description, 'é', 'é') WHERE description LIKE '%é%';
UPDATE products SET title = REPLACE(title, 'ü', 'ü') WHERE title LIKE '%ü%';
UPDATE products SET description = REPLACE(description, 'ü', 'ü') WHERE description LIKE '%ü%';
-- HTML entities (shouldn't be in database)
UPDATE products SET title = REPLACE(title, '&ndash;', '') WHERE title LIKE '%&ndash;%';
UPDATE products SET description = REPLACE(description, '&ndash;', '') WHERE description LIKE '%&ndash;%';
UPDATE products SET title = REPLACE(title, '&rdquo;', '"') WHERE title LIKE '%&rdquo;%';
UPDATE products SET description = REPLACE(description, '&rdquo;', '"') WHERE description LIKE '%&rdquo;%';
UPDATE products SET title = REPLACE(title, '&ldquo;', '"') WHERE title LIKE '%&ldquo;%';
UPDATE products SET description = REPLACE(description, '&ldquo;', '"') WHERE description LIKE '%&ldquo;%';
UPDATE products SET title = REPLACE(title, '&amp;', '&') WHERE title LIKE '%&amp;%';
UPDATE products SET description = REPLACE(description, '&amp;', '&') WHERE description LIKE '%&amp;%';
UPDATE products SET title = REPLACE(title, '&frac12;', '½') WHERE title LIKE '%&frac12;%';
UPDATE products SET description = REPLACE(description, '&frac12;', '½') WHERE description LIKE '%&frac12;%';
UPDATE products SET title = REPLACE(title, '&deg;', '°') WHERE title LIKE '%&deg;%';
UPDATE products SET description = REPLACE(description, '&deg;', '°') WHERE description LIKE '%&deg;%';
UPDATE products SET title = REPLACE(title, '&nbsp;', ' ') WHERE title LIKE '%&nbsp;%';
UPDATE products SET description = REPLACE(description, '&nbsp;', ' ') WHERE description LIKE '%&nbsp;%';
-- Specific product fixes
UPDATE products SET title = 'C95SN189C DSTGL40/C-Digital temperature probe for P8xx1 Web Sensor. Range -30 to +80°C. With CINCH connector, 1m Cable' WHERE id = 30;
UPDATE products SET title = 'Mid-West Instrument Model 240 SC 02 O(TT)' WHERE id = 38;
UPDATE products SET title = 'Newson Gale Earth-Rite® II FIBC Static Earthing System, GRP Enclosure with 5m 2 Core Spiral Cable and FIBC Clamp' WHERE id = 67;
UPDATE products SET title = 'Newson Gale Earth-Rite® RTR™ Tri-Mode Static Grounding System, Metal Enclosure, X90-IP Heavy Duty Clamp with 10m 2 Core Spiral Cable' WHERE id = 85;
-- ============================================================================
-- PART 4: Verification Queries
-- ============================================================================
-- Run these after the fixes to verify they worked
-- Check fixed principles
SELECT id, name, city FROM principles WHERE id IN (2, 9, 13, 14, 58, 65);
-- Check fixed customers
SELECT id, name FROM customers WHERE id IN (253, 1006, 1387, 1608, 2174, 2215, 2375, 3143, 3529, 4325, 4350, 4669, 4743, 4764, 4893);
-- Check fixed addresses
SELECT id, address FROM addresses WHERE id = 19;
-- Count remaining corrupted records (should be much lower)
SELECT COUNT(*) as remaining_corrupted_principles
FROM principles
WHERE name REGEXP '[^ -~]' OR address REGEXP '[^ -~]' OR city REGEXP '[^ -~]';
SELECT COUNT(*) as remaining_corrupted_customers
FROM customers
WHERE name REGEXP '[^ -~]';
SELECT COUNT(*) as remaining_corrupted_addresses
FROM addresses
WHERE address REGEXP '[^ -~]' OR city REGEXP '[^ -~]';
SELECT COUNT(*) as remaining_corrupted_products
FROM products
WHERE title REGEXP '[^ -~]' OR description REGEXP '[^ -~]' OR item_description REGEXP '[^ -~]';
-- Check sample fixed products
SELECT id, title FROM products WHERE id IN (30, 38, 67, 85);
-- ============================================================================
-- COMPLETION
-- ============================================================================
SELECT 'Fix script completed! Review the verification queries above.' as status;