318 lines
16 KiB
SQL
318 lines
16 KiB
SQL
-- ============================================================================
|
||
-- Fix Corrupted Character Data in CMC Database
|
||
-- ============================================================================
|
||
-- This script fixes mojibake (corrupted UTF-8 characters) in the database
|
||
-- Run after converting tables to utf8mb4
|
||
--
|
||
-- IMPORTANT: Review this script and test on a backup first!
|
||
-- Create backup: docker exec cmc-db mariadb-dump -u root -psecureRootPassword cmc | gzip > backup_before_fix.sql.gz
|
||
--
|
||
-- To run: docker exec -i cmc-db mariadb -u root -psecureRootPassword --default-character-set=utf8mb4 cmc < scripts/fix_corrupted_data.sql
|
||
-- ============================================================================
|
||
|
||
-- Create a backup table for principles (optional safety measure)
|
||
-- DROP TABLE IF EXISTS principles_backup;
|
||
-- CREATE TABLE principles_backup AS SELECT * FROM principles;
|
||
|
||
SET NAMES utf8mb4;
|
||
|
||
-- ============================================================================
|
||
-- PART 1: Global Pattern Fixes Using REPLACE
|
||
-- ============================================================================
|
||
-- These fix common mojibake patterns across all affected tables
|
||
|
||
-- ----------------------------------------------------------------------------
|
||
-- Fix CUSTOMERS table
|
||
-- ----------------------------------------------------------------------------
|
||
UPDATE customers SET name = REPLACE(name, '’', ''') WHERE name LIKE '%’%'; -- Smart apostrophe
|
||
UPDATE customers SET name = REPLACE(name, '“', '"') WHERE name LIKE '%“%'; -- Left double quote
|
||
UPDATE customers SET name = REPLACE(name, 'â€', '"') WHERE name LIKE '%â€%'; -- Right double quote
|
||
UPDATE customers SET name = REPLACE(name, 'â€"', '–') WHERE name LIKE '%â€"%'; -- En dash
|
||
UPDATE customers SET name = REPLACE(name, 'â€"', '—') WHERE name LIKE '%â€"%'; -- Em dash
|
||
UPDATE customers SET name = REPLACE(name, '​', '') WHERE name LIKE '%​%'; -- Zero-width space (remove)
|
||
UPDATE customers SET name = REPLACE(name, ' ', ' ') WHERE name LIKE '% %'; -- Special space
|
||
UPDATE customers SET name = REPLACE(name, 'é', 'é') WHERE name LIKE '%é%'; -- é
|
||
UPDATE customers SET name = REPLACE(name, 'É', 'É') WHERE name LIKE '%É%'; -- É
|
||
UPDATE customers SET name = REPLACE(name, 'ó', 'ó') WHERE name LIKE '%ó%'; -- ó
|
||
UPDATE customers SET name = REPLACE(name, 'Ã', 'í') WHERE name LIKE '%Ã%'; -- í
|
||
UPDATE customers SET name = REPLACE(name, 'ç', 'ç') WHERE name LIKE '%ç%'; -- ç
|
||
UPDATE customers SET name = REPLACE(name, 'ü', 'ü') WHERE name LIKE '%ü%'; -- ü
|
||
UPDATE customers SET name = REPLACE(name, 'á', 'á') WHERE name LIKE '%á%'; -- á
|
||
UPDATE customers SET name = REPLACE(name, 'ñ', 'ñ') WHERE name LIKE '%ñ%'; -- ñ
|
||
UPDATE customers SET name = REPLACE(name, 'ö', 'ö') WHERE name LIKE '%ö%'; -- ö
|
||
UPDATE customers SET name = REPLACE(name, 'ô', 'ô') WHERE name LIKE '%ô%'; -- ô
|
||
UPDATE customers SET name = REPLACE(name, 'İ', 'İ') WHERE name LIKE '%İ%'; -- Turkish İ
|
||
UPDATE customers SET name = REPLACE(name, 'Å', 'Ş') WHERE name LIKE '%Å%'; -- Turkish Ş
|
||
UPDATE customers SET name = REPLACE(name, 'Ä', 'Ğ') WHERE name LIKE '%Ä%'; -- Turkish Ğ
|
||
UPDATE customers SET name = REPLACE(name, 'Ä', 'Č') WHERE name LIKE '%Ä%'; -- Czech Č
|
||
UPDATE customers SET name = REPLACE(name, 'Ä›', 'ě') WHERE name LIKE '%Ä›%'; -- Czech ě
|
||
|
||
-- Remove trailing tabs
|
||
UPDATE customers SET name = TRIM(TRAILING '\t' FROM name) WHERE name LIKE '%\t';
|
||
UPDATE customers SET trading_name = TRIM(TRAILING '\t' FROM trading_name) WHERE trading_name LIKE '%\t';
|
||
|
||
-- ----------------------------------------------------------------------------
|
||
-- Fix PRINCIPLES table
|
||
-- ----------------------------------------------------------------------------
|
||
UPDATE principles SET name = REPLACE(name, 'ü', 'ü') WHERE name LIKE '%ü%';
|
||
UPDATE principles SET name = REPLACE(name, 'é', 'é') WHERE name LIKE '%é%';
|
||
UPDATE principles SET name = REPLACE(name, 'ó', 'ó') WHERE name LIKE '%ó%';
|
||
|
||
UPDATE principles SET address = REPLACE(address, 'ß', 'ß') WHERE address LIKE '%ß%'; -- German ß
|
||
UPDATE principles SET address = REPLACE(address, 'ü', 'ü') WHERE address LIKE '%ü%';
|
||
UPDATE principles SET address = REPLACE(address, 'é', 'é') WHERE address LIKE '%é%';
|
||
UPDATE principles SET address = REPLACE(address, 'ñ', 'ñ') WHERE address LIKE '%ñ%';
|
||
UPDATE principles SET address = REPLACE(address, 'Ä', 'č') WHERE address LIKE '%Ä%'; -- Czech č
|
||
UPDATE principles SET address = REPLACE(address, 'ž', 'ž') WHERE address LIKE '%ž%'; -- Czech ž
|
||
UPDATE principles SET address = REPLACE(address, 'Å¡', 'š') WHERE address LIKE '%Å¡%'; -- Czech š
|
||
UPDATE principles SET address = REPLACE(address, 'Ä›', 'ě') WHERE address LIKE '%Ä›%'; -- Czech ě
|
||
UPDATE principles SET address = REPLACE(address, 'Å', 'ň') WHERE address LIKE '%Å%'; -- Czech ň
|
||
UPDATE principles SET address = REPLACE(address, 'Å™', 'ř') WHERE address LIKE '%Å™%'; -- Czech ř
|
||
|
||
UPDATE principles SET city = REPLACE(city, 'ü', 'ü') WHERE city LIKE '%ü%';
|
||
UPDATE principles SET city = REPLACE(city, 'é', 'é') WHERE city LIKE '%é%';
|
||
UPDATE principles SET city = REPLACE(city, 'ó', 'ó') WHERE city LIKE '%ó%';
|
||
|
||
-- ----------------------------------------------------------------------------
|
||
-- Fix ADDRESSES table
|
||
-- ----------------------------------------------------------------------------
|
||
UPDATE addresses SET address = REPLACE(address, 'â€"', '–') WHERE address LIKE '%â€"%'; -- En dash
|
||
UPDATE addresses SET address = REPLACE(address, 'â€"', '—') WHERE address LIKE '%â€"%'; -- Em dash
|
||
UPDATE addresses SET address = REPLACE(address, 'é', 'é') WHERE address LIKE '%é%';
|
||
UPDATE addresses SET address = REPLACE(address, 'ü', 'ü') WHERE address LIKE '%ü%';
|
||
|
||
UPDATE addresses SET city = REPLACE(city, 'é', 'é') WHERE city LIKE '%é%';
|
||
UPDATE addresses SET city = REPLACE(city, 'ü', 'ü') WHERE city LIKE '%ü%';
|
||
|
||
-- ============================================================================
|
||
-- PART 2: Specific Record Fixes
|
||
-- ============================================================================
|
||
-- These fix specific known corrupted records where pattern matching isn't enough
|
||
|
||
-- ----------------------------------------------------------------------------
|
||
-- PRINCIPLES - Specific fixes
|
||
-- ----------------------------------------------------------------------------
|
||
|
||
-- ID 2: German ß
|
||
UPDATE principles
|
||
SET address = 'Heinz-Fangman-Straße 18'
|
||
WHERE id = 2;
|
||
|
||
-- ID 9: Czech characters (this is complex, needs full correction)
|
||
UPDATE principles
|
||
SET address = 'Bezručova 2901\n756 61 Rožnov pod Radhoštěm',
|
||
city = 'Rožnov pod Radhoštěm'
|
||
WHERE id = 9;
|
||
|
||
-- ID 13: German ü
|
||
UPDATE principles
|
||
SET name = 'IEP Technologies GmbH - BRILEX Gesellschaft für Explosionsschutz mbH'
|
||
WHERE id = 13;
|
||
|
||
-- ID 14: German ß
|
||
UPDATE principles
|
||
SET address = 'Liebigstraße 2'
|
||
WHERE id = 14;
|
||
|
||
-- ID 58: Spanish ó
|
||
UPDATE principles
|
||
SET name = 'Nosia S.r.l.- Señalización Industrial'
|
||
WHERE id = 58;
|
||
|
||
-- ID 65: German ß
|
||
UPDATE principles
|
||
SET address = 'Alte Emser Straße 32'
|
||
WHERE id = 65;
|
||
|
||
-- ----------------------------------------------------------------------------
|
||
-- CUSTOMERS - Specific fixes
|
||
-- ----------------------------------------------------------------------------
|
||
|
||
-- ID 253: É
|
||
UPDATE customers
|
||
SET name = 'SOLUZÉ CIVIL ENGINEERS Trading Under SOLUZE PTY LTD'
|
||
WHERE id = 253;
|
||
|
||
-- ID 1006: Smart apostrophe
|
||
UPDATE customers
|
||
SET name = 'Dr. Prem''s Molecules Private Limited (DPMolecules)'
|
||
WHERE id = 1006;
|
||
|
||
-- ID 1387: Smart quotes
|
||
UPDATE customers
|
||
SET name = 'DEE ENTERPRISES (QLD) PTY LTD trading as "MEKLEK"'
|
||
WHERE id = 1387;
|
||
|
||
-- ID 1608: Smart apostrophe
|
||
UPDATE customers
|
||
SET name = 'Guidera O''Connor Pty Ltd'
|
||
WHERE id = 1608;
|
||
|
||
-- ID 2174: Zero-width space
|
||
UPDATE customers
|
||
SET name = 'Ingredion ANZ Pty Ltd'
|
||
WHERE id = 2174;
|
||
|
||
-- ID 2215: French é
|
||
UPDATE customers
|
||
SET name = 'Vale Nouvelle-Calédonie S.A.S'
|
||
WHERE id = 2215;
|
||
|
||
-- ID 2375: Spanish ó
|
||
UPDATE customers
|
||
SET name = 'Evaluación y Control Ambiental S.A.S.'
|
||
WHERE id = 2375;
|
||
|
||
-- ID 3143: Portuguese ç and ó
|
||
UPDATE customers
|
||
SET name = 'Zontahevy Comércio e Serviços Offshore Ltda'
|
||
WHERE id = 3143;
|
||
|
||
-- ID 3529: French é
|
||
UPDATE customers
|
||
SET name = 'Société des Mines de Syama'
|
||
WHERE id = 3529;
|
||
|
||
-- ID 3633: Special space (em space)
|
||
UPDATE customers
|
||
SET name = 'P.J. Berriman & Co PTY LTD'
|
||
WHERE id = 3633;
|
||
|
||
-- ID 4325: Danish ø
|
||
UPDATE customers
|
||
SET name = 'Rambøll Danmark'
|
||
WHERE id = 4325;
|
||
|
||
-- ID 4350: Turkish characters İ, Ş, Ğ
|
||
UPDATE customers
|
||
SET name = 'SONNIVA ENERGY MAKİNA İNŞAAT İTH. İHR. LTD. ŞTİ.'
|
||
WHERE id = 4350;
|
||
|
||
-- ID 4669: French é
|
||
UPDATE customers
|
||
SET name = 'F.C.C. Fluides Conseils Calédonie SARL'
|
||
WHERE id = 4669;
|
||
|
||
-- ID 4743: Czech ě
|
||
UPDATE customers
|
||
SET name = 'DGPack Prostějov'
|
||
WHERE id = 4743;
|
||
|
||
-- ID 4764: Zero-width space
|
||
UPDATE customers
|
||
SET name = 'Mccready Welding Pty Ltd Trading under the entity Mccready''s Welding Services'
|
||
WHERE id = 4764;
|
||
|
||
-- ID 4893: Spanish í
|
||
UPDATE customers
|
||
SET name = 'Oxiquímica, S.A.P.I. de C.V.'
|
||
WHERE id = 4893;
|
||
|
||
-- ----------------------------------------------------------------------------
|
||
-- ADDRESSES - Specific fixes
|
||
-- ----------------------------------------------------------------------------
|
||
|
||
-- ID 19: En dash (7–11)
|
||
UPDATE addresses
|
||
SET address = 'Lvl 3, Building B, 7–11 Talavera Road'
|
||
WHERE id = 19;
|
||
|
||
-- ============================================================================
|
||
-- PART 3: Fix PRODUCTS Table (12,536+ corrupted records)
|
||
-- ============================================================================
|
||
-- Products table has extensive corruption from mojibake and HTML entities
|
||
|
||
-- Degree symbol: ° → °
|
||
UPDATE products SET title = REPLACE(title, '°', '°') WHERE title LIKE '%°%';
|
||
UPDATE products SET description = REPLACE(description, '°', '°') WHERE description LIKE '%°%';
|
||
UPDATE products SET item_description = REPLACE(item_description, '°', '°') WHERE item_description LIKE '%°%';
|
||
|
||
-- Registered trademark: ® → ®
|
||
UPDATE products SET title = REPLACE(title, '®', '®') WHERE title LIKE '%®%';
|
||
UPDATE products SET description = REPLACE(description, '®', '®') WHERE description LIKE '%®%';
|
||
UPDATE products SET item_description = REPLACE(item_description, '®', '®') WHERE item_description LIKE '%®%';
|
||
|
||
-- Trademark: â„¢ → ™
|
||
UPDATE products SET title = REPLACE(title, 'â„¢', '™') WHERE title LIKE '%â„¢%';
|
||
UPDATE products SET description = REPLACE(description, 'â„¢', '™') WHERE description LIKE '%â„¢%';
|
||
UPDATE products SET item_description = REPLACE(item_description, 'â„¢', '™') WHERE item_description LIKE '%â„¢%';
|
||
|
||
-- Smart quotes and dashes in products
|
||
UPDATE products SET title = REPLACE(title, '’', ''') WHERE title LIKE '%’%';
|
||
UPDATE products SET description = REPLACE(description, '’', ''') WHERE description LIKE '%’%';
|
||
UPDATE products SET title = REPLACE(title, '“', '"') WHERE title LIKE '%“%';
|
||
UPDATE products SET description = REPLACE(description, '“', '"') WHERE description LIKE '%“%';
|
||
UPDATE products SET title = REPLACE(title, 'â€', '"') WHERE title LIKE '%â€%';
|
||
UPDATE products SET description = REPLACE(description, 'â€', '"') WHERE description LIKE '%â€%';
|
||
UPDATE products SET title = REPLACE(title, 'â€"', '–') WHERE title LIKE '%â€"%';
|
||
UPDATE products SET description = REPLACE(description, 'â€"', '–') WHERE description LIKE '%â€"%';
|
||
UPDATE products SET title = REPLACE(title, 'â€"', '—') WHERE title LIKE '%â€"%';
|
||
UPDATE products SET description = REPLACE(description, 'â€"', '—') WHERE description LIKE '%â€"%';
|
||
|
||
-- Accented characters in products
|
||
UPDATE products SET title = REPLACE(title, 'é', 'é') WHERE title LIKE '%é%';
|
||
UPDATE products SET description = REPLACE(description, 'é', 'é') WHERE description LIKE '%é%';
|
||
UPDATE products SET title = REPLACE(title, 'ü', 'ü') WHERE title LIKE '%ü%';
|
||
UPDATE products SET description = REPLACE(description, 'ü', 'ü') WHERE description LIKE '%ü%';
|
||
|
||
-- HTML entities (shouldn't be in database)
|
||
UPDATE products SET title = REPLACE(title, '–', '–') WHERE title LIKE '%–%';
|
||
UPDATE products SET description = REPLACE(description, '–', '–') WHERE description LIKE '%–%';
|
||
UPDATE products SET title = REPLACE(title, '”', '"') WHERE title LIKE '%”%';
|
||
UPDATE products SET description = REPLACE(description, '”', '"') WHERE description LIKE '%”%';
|
||
UPDATE products SET title = REPLACE(title, '“', '"') WHERE title LIKE '%“%';
|
||
UPDATE products SET description = REPLACE(description, '“', '"') WHERE description LIKE '%“%';
|
||
UPDATE products SET title = REPLACE(title, '&', '&') WHERE title LIKE '%&%';
|
||
UPDATE products SET description = REPLACE(description, '&', '&') WHERE description LIKE '%&%';
|
||
UPDATE products SET title = REPLACE(title, '½', '½') WHERE title LIKE '%½%';
|
||
UPDATE products SET description = REPLACE(description, '½', '½') WHERE description LIKE '%½%';
|
||
UPDATE products SET title = REPLACE(title, '°', '°') WHERE title LIKE '%°%';
|
||
UPDATE products SET description = REPLACE(description, '°', '°') WHERE description LIKE '%°%';
|
||
UPDATE products SET title = REPLACE(title, ' ', ' ') WHERE title LIKE '% %';
|
||
UPDATE products SET description = REPLACE(description, ' ', ' ') WHERE description LIKE '% %';
|
||
|
||
-- Specific product fixes
|
||
UPDATE products SET title = 'C95SN189C DSTGL40/C-Digital temperature probe for P8xx1 Web Sensor. Range -30 to +80°C. With CINCH connector, 1m Cable' WHERE id = 30;
|
||
UPDATE products SET title = 'Mid-West Instrument Model 240 – SC – 02 – O(TT)' WHERE id = 38;
|
||
UPDATE products SET title = 'Newson Gale Earth-Rite® II FIBC Static Earthing System, GRP Enclosure with 5m 2 Core Spiral Cable and FIBC Clamp' WHERE id = 67;
|
||
UPDATE products SET title = 'Newson Gale Earth-Rite® RTR™ Tri-Mode Static Grounding System, Metal Enclosure, X90-IP Heavy Duty Clamp with 10m 2 Core Spiral Cable' WHERE id = 85;
|
||
|
||
-- ============================================================================
|
||
-- PART 4: Verification Queries
|
||
-- ============================================================================
|
||
-- Run these after the fixes to verify they worked
|
||
|
||
-- Check fixed principles
|
||
SELECT id, name, city FROM principles WHERE id IN (2, 9, 13, 14, 58, 65);
|
||
|
||
-- Check fixed customers
|
||
SELECT id, name FROM customers WHERE id IN (253, 1006, 1387, 1608, 2174, 2215, 2375, 3143, 3529, 4325, 4350, 4669, 4743, 4764, 4893);
|
||
|
||
-- Check fixed addresses
|
||
SELECT id, address FROM addresses WHERE id = 19;
|
||
|
||
-- Count remaining corrupted records (should be much lower)
|
||
SELECT COUNT(*) as remaining_corrupted_principles
|
||
FROM principles
|
||
WHERE name REGEXP '[^ -~]' OR address REGEXP '[^ -~]' OR city REGEXP '[^ -~]';
|
||
|
||
SELECT COUNT(*) as remaining_corrupted_customers
|
||
FROM customers
|
||
WHERE name REGEXP '[^ -~]';
|
||
|
||
SELECT COUNT(*) as remaining_corrupted_addresses
|
||
FROM addresses
|
||
WHERE address REGEXP '[^ -~]' OR city REGEXP '[^ -~]';
|
||
|
||
SELECT COUNT(*) as remaining_corrupted_products
|
||
FROM products
|
||
WHERE title REGEXP '[^ -~]' OR description REGEXP '[^ -~]' OR item_description REGEXP '[^ -~]';
|
||
|
||
-- Check sample fixed products
|
||
SELECT id, title FROM products WHERE id IN (30, 38, 67, 85);
|
||
|
||
-- ============================================================================
|
||
-- COMPLETION
|
||
-- ============================================================================
|
||
SELECT 'Fix script completed! Review the verification queries above.' as status;
|