diff --git a/scripts/convert_charset_to_utf8mb4.sql b/scripts/convert_charset_to_utf8mb4.sql new file mode 100644 index 00000000..461d13ba --- /dev/null +++ b/scripts/convert_charset_to_utf8mb4.sql @@ -0,0 +1,59 @@ +ALTER TABLE `acos` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `addresses` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `aros` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `aros_acos` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `boxes` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `cake_sessions` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `contacts` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `contact_categories` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `costings` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `countries` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `currencies` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `customers` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `customer_categories` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `documents` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `doc_pages` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `emails` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `emails_enquiries` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `emails_invoices` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `emails_jobs` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `emails_purchase_orders` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `email_attachments` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `email_recipients` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `enquiries` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `enquiry_email_queues` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `freight_forwarders` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `freight_services` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `groups` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `industries` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `industries_customers` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `invoices` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `jobs` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `jobstatus` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `jobs_purchase_orders` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `job_products` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `line_items` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `order_acknowledgements` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `packing_lists` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `pages` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `principles` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `principle_addresses` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `principle_contacts` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `products` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `product_attachments` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `product_categories` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `product_options` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `product_options_categories` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `purchase_orders` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `quotes` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `quote_pages` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `quote_products` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `shipments` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `shipments_jobs` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `shipments_principles` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `shipments_purchase_orders` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `shipment_categories` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `shipment_invoices` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `states` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `statuses` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +ALTER TABLE `users` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; diff --git a/scripts/fix_corrupted_data.sql b/scripts/fix_corrupted_data.sql new file mode 100644 index 00000000..2a9894ae --- /dev/null +++ b/scripts/fix_corrupted_data.sql @@ -0,0 +1,250 @@ +-- ============================================================================ +-- Fix Corrupted Character Data in CMC Database +-- ============================================================================ +-- This script fixes mojibake (corrupted UTF-8 characters) in the database +-- Run after converting tables to utf8mb4 +-- +-- IMPORTANT: Review this script and test on a backup first! +-- Create backup: docker exec cmc-db mariadb-dump -u root -psecureRootPassword cmc | gzip > backup_before_fix.sql.gz +-- +-- To run: docker exec -i cmc-db mariadb -u root -psecureRootPassword --default-character-set=utf8mb4 cmc < scripts/fix_corrupted_data.sql +-- ============================================================================ + +-- Create a backup table for principles (optional safety measure) +-- DROP TABLE IF EXISTS principles_backup; +-- CREATE TABLE principles_backup AS SELECT * FROM principles; + +SET NAMES utf8mb4; + +-- ============================================================================ +-- PART 1: Global Pattern Fixes Using REPLACE +-- ============================================================================ +-- These fix common mojibake patterns across all affected tables + +-- ---------------------------------------------------------------------------- +-- Fix CUSTOMERS table +-- ---------------------------------------------------------------------------- +UPDATE customers SET name = REPLACE(name, '’', ''') WHERE name LIKE '%’%'; -- Smart apostrophe +UPDATE customers SET name = REPLACE(name, '“', '"') WHERE name LIKE '%“%'; -- Left double quote +UPDATE customers SET name = REPLACE(name, 'â€', '"') WHERE name LIKE '%â€%'; -- Right double quote +UPDATE customers SET name = REPLACE(name, 'â€"', '–') WHERE name LIKE '%â€"%'; -- En dash +UPDATE customers SET name = REPLACE(name, 'â€"', '—') WHERE name LIKE '%â€"%'; -- Em dash +UPDATE customers SET name = REPLACE(name, '​', '') WHERE name LIKE '%​%'; -- Zero-width space (remove) +UPDATE customers SET name = REPLACE(name, ' ', ' ') WHERE name LIKE '% %'; -- Special space +UPDATE customers SET name = REPLACE(name, 'é', 'é') WHERE name LIKE '%é%'; -- é +UPDATE customers SET name = REPLACE(name, 'É', 'É') WHERE name LIKE '%É%'; -- É +UPDATE customers SET name = REPLACE(name, 'ó', 'ó') WHERE name LIKE '%ó%'; -- ó +UPDATE customers SET name = REPLACE(name, 'Ã', 'í') WHERE name LIKE '%Ã%'; -- í +UPDATE customers SET name = REPLACE(name, 'ç', 'ç') WHERE name LIKE '%ç%'; -- ç +UPDATE customers SET name = REPLACE(name, 'ü', 'ü') WHERE name LIKE '%ü%'; -- ü +UPDATE customers SET name = REPLACE(name, 'á', 'á') WHERE name LIKE '%á%'; -- á +UPDATE customers SET name = REPLACE(name, 'ñ', 'ñ') WHERE name LIKE '%ñ%'; -- ñ +UPDATE customers SET name = REPLACE(name, 'ö', 'ö') WHERE name LIKE '%ö%'; -- ö +UPDATE customers SET name = REPLACE(name, 'ô', 'ô') WHERE name LIKE '%ô%'; -- ô +UPDATE customers SET name = REPLACE(name, 'İ', 'İ') WHERE name LIKE '%İ%'; -- Turkish İ +UPDATE customers SET name = REPLACE(name, 'Å', 'Ş') WHERE name LIKE '%Å%'; -- Turkish Ş +UPDATE customers SET name = REPLACE(name, 'Ä', 'Ğ') WHERE name LIKE '%Ä%'; -- Turkish Ğ +UPDATE customers SET name = REPLACE(name, 'Ä', 'Č') WHERE name LIKE '%Ä%'; -- Czech Č +UPDATE customers SET name = REPLACE(name, 'Ä›', 'ě') WHERE name LIKE '%Ä›%'; -- Czech ě + +-- Remove trailing tabs +UPDATE customers SET name = TRIM(TRAILING '\t' FROM name) WHERE name LIKE '%\t'; +UPDATE customers SET trading_name = TRIM(TRAILING '\t' FROM trading_name) WHERE trading_name LIKE '%\t'; + +-- ---------------------------------------------------------------------------- +-- Fix PRINCIPLES table +-- ---------------------------------------------------------------------------- +UPDATE principles SET name = REPLACE(name, 'ü', 'ü') WHERE name LIKE '%ü%'; +UPDATE principles SET name = REPLACE(name, 'é', 'é') WHERE name LIKE '%é%'; +UPDATE principles SET name = REPLACE(name, 'ó', 'ó') WHERE name LIKE '%ó%'; + +UPDATE principles SET address = REPLACE(address, 'ß', 'ß') WHERE address LIKE '%ß%'; -- German ß +UPDATE principles SET address = REPLACE(address, 'ü', 'ü') WHERE address LIKE '%ü%'; +UPDATE principles SET address = REPLACE(address, 'é', 'é') WHERE address LIKE '%é%'; +UPDATE principles SET address = REPLACE(address, 'ñ', 'ñ') WHERE address LIKE '%ñ%'; +UPDATE principles SET address = REPLACE(address, 'Ä', 'č') WHERE address LIKE '%Ä%'; -- Czech č +UPDATE principles SET address = REPLACE(address, 'ž', 'ž') WHERE address LIKE '%ž%'; -- Czech ž +UPDATE principles SET address = REPLACE(address, 'Å¡', 'š') WHERE address LIKE '%Å¡%'; -- Czech š +UPDATE principles SET address = REPLACE(address, 'Ä›', 'ě') WHERE address LIKE '%Ä›%'; -- Czech ě +UPDATE principles SET address = REPLACE(address, 'Å', 'ň') WHERE address LIKE '%Å%'; -- Czech ň +UPDATE principles SET address = REPLACE(address, 'Å™', 'ř') WHERE address LIKE '%Å™%'; -- Czech ř + +UPDATE principles SET city = REPLACE(city, 'ü', 'ü') WHERE city LIKE '%ü%'; +UPDATE principles SET city = REPLACE(city, 'é', 'é') WHERE city LIKE '%é%'; +UPDATE principles SET city = REPLACE(city, 'ó', 'ó') WHERE city LIKE '%ó%'; + +-- ---------------------------------------------------------------------------- +-- Fix ADDRESSES table +-- ---------------------------------------------------------------------------- +UPDATE addresses SET address = REPLACE(address, 'â€"', '–') WHERE address LIKE '%â€"%'; -- En dash +UPDATE addresses SET address = REPLACE(address, 'â€"', '—') WHERE address LIKE '%â€"%'; -- Em dash +UPDATE addresses SET address = REPLACE(address, 'é', 'é') WHERE address LIKE '%é%'; +UPDATE addresses SET address = REPLACE(address, 'ü', 'ü') WHERE address LIKE '%ü%'; + +UPDATE addresses SET city = REPLACE(city, 'é', 'é') WHERE city LIKE '%é%'; +UPDATE addresses SET city = REPLACE(city, 'ü', 'ü') WHERE city LIKE '%ü%'; + +-- ============================================================================ +-- PART 2: Specific Record Fixes +-- ============================================================================ +-- These fix specific known corrupted records where pattern matching isn't enough + +-- ---------------------------------------------------------------------------- +-- PRINCIPLES - Specific fixes +-- ---------------------------------------------------------------------------- + +-- ID 2: German ß +UPDATE principles +SET address = 'Heinz-Fangman-Straße 18' +WHERE id = 2; + +-- ID 9: Czech characters (this is complex, needs full correction) +UPDATE principles +SET address = 'Bezručova 2901\n756 61 Rožnov pod Radhoštěm', + city = 'Rožnov pod Radhoštěm' +WHERE id = 9; + +-- ID 13: German ü +UPDATE principles +SET name = 'IEP Technologies GmbH - BRILEX Gesellschaft für Explosionsschutz mbH' +WHERE id = 13; + +-- ID 14: German ß +UPDATE principles +SET address = 'Liebigstraße 2' +WHERE id = 14; + +-- ID 58: Spanish ó +UPDATE principles +SET name = 'Nosia S.r.l.- Señalización Industrial' +WHERE id = 58; + +-- ID 65: German ß +UPDATE principles +SET address = 'Alte Emser Straße 32' +WHERE id = 65; + +-- ---------------------------------------------------------------------------- +-- CUSTOMERS - Specific fixes +-- ---------------------------------------------------------------------------- + +-- ID 253: É +UPDATE customers +SET name = 'SOLUZÉ CIVIL ENGINEERS Trading Under SOLUZE PTY LTD' +WHERE id = 253; + +-- ID 1006: Smart apostrophe +UPDATE customers +SET name = 'Dr. Prem''s Molecules Private Limited (DPMolecules)' +WHERE id = 1006; + +-- ID 1387: Smart quotes +UPDATE customers +SET name = 'DEE ENTERPRISES (QLD) PTY LTD trading as "MEKLEK"' +WHERE id = 1387; + +-- ID 1608: Smart apostrophe +UPDATE customers +SET name = 'Guidera O''Connor Pty Ltd' +WHERE id = 1608; + +-- ID 2174: Zero-width space +UPDATE customers +SET name = 'Ingredion ANZ Pty Ltd' +WHERE id = 2174; + +-- ID 2215: French é +UPDATE customers +SET name = 'Vale Nouvelle-Calédonie S.A.S' +WHERE id = 2215; + +-- ID 2375: Spanish ó +UPDATE customers +SET name = 'Evaluación y Control Ambiental S.A.S.' +WHERE id = 2375; + +-- ID 3143: Portuguese ç and ó +UPDATE customers +SET name = 'Zontahevy Comércio e Serviços Offshore Ltda' +WHERE id = 3143; + +-- ID 3529: French é +UPDATE customers +SET name = 'Société des Mines de Syama' +WHERE id = 3529; + +-- ID 3633: Special space (em space) +UPDATE customers +SET name = 'P.J. Berriman & Co PTY LTD' +WHERE id = 3633; + +-- ID 4325: Danish ø +UPDATE customers +SET name = 'Rambøll Danmark' +WHERE id = 4325; + +-- ID 4350: Turkish characters İ, Ş, Ğ +UPDATE customers +SET name = 'SONNIVA ENERGY MAKİNA İNŞAAT İTH. İHR. LTD. ŞTİ.' +WHERE id = 4350; + +-- ID 4669: French é +UPDATE customers +SET name = 'F.C.C. Fluides Conseils Calédonie SARL' +WHERE id = 4669; + +-- ID 4743: Czech ě +UPDATE customers +SET name = 'DGPack Prostějov' +WHERE id = 4743; + +-- ID 4764: Zero-width space +UPDATE customers +SET name = 'Mccready Welding Pty Ltd Trading under the entity Mccready''s Welding Services' +WHERE id = 4764; + +-- ID 4893: Spanish í +UPDATE customers +SET name = 'Oxiquímica, S.A.P.I. de C.V.' +WHERE id = 4893; + +-- ---------------------------------------------------------------------------- +-- ADDRESSES - Specific fixes +-- ---------------------------------------------------------------------------- + +-- ID 19: En dash (7–11) +UPDATE addresses +SET address = 'Lvl 3, Building B, 7–11 Talavera Road' +WHERE id = 19; + +-- ============================================================================ +-- PART 3: Verification Queries +-- ============================================================================ +-- Run these after the fixes to verify they worked + +-- Check fixed principles +SELECT id, name, city FROM principles WHERE id IN (2, 9, 13, 14, 58, 65); + +-- Check fixed customers +SELECT id, name FROM customers WHERE id IN (253, 1006, 1387, 1608, 2174, 2215, 2375, 3143, 3529, 4325, 4350, 4669, 4743, 4764, 4893); + +-- Check fixed addresses +SELECT id, address FROM addresses WHERE id = 19; + +-- Count remaining corrupted records (should be much lower) +SELECT COUNT(*) as remaining_corrupted_principles +FROM principles +WHERE name REGEXP '[^ -~]' OR address REGEXP '[^ -~]' OR city REGEXP '[^ -~]'; + +SELECT COUNT(*) as remaining_corrupted_customers +FROM customers +WHERE name REGEXP '[^ -~]'; + +SELECT COUNT(*) as remaining_corrupted_addresses +FROM addresses +WHERE address REGEXP '[^ -~]' OR city REGEXP '[^ -~]'; + +-- ============================================================================ +-- COMPLETION +-- ============================================================================ +SELECT 'Fix script completed! Review the verification queries above.' as status; diff --git a/scripts/report_remaining_corruption.sh b/scripts/report_remaining_corruption.sh new file mode 100755 index 00000000..0d712243 --- /dev/null +++ b/scripts/report_remaining_corruption.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# ============================================================================ +# Generate Report of Remaining Corrupted Data +# ============================================================================ + +DB_USER="root" +DB_PASS="secureRootPassword" +DB_NAME="cmc" +CONTAINER="cmc-db" + +docker exec $CONTAINER mariadb \ + -u $DB_USER \ + -p$DB_PASS \ + --default-character-set=utf8mb4 \ + $DB_NAME < scripts/report_remaining_corruption.sql diff --git a/scripts/report_remaining_corruption.sql b/scripts/report_remaining_corruption.sql new file mode 100644 index 00000000..be21ed6d --- /dev/null +++ b/scripts/report_remaining_corruption.sql @@ -0,0 +1,191 @@ +-- ============================================================================ +-- Report Remaining Corrupted Data After Fix +-- ============================================================================ +-- Run this after fix_corrupted_data.sql to find records that still need manual fixes +-- +-- To run: docker exec cmc-db mariadb -u root -psecureRootPassword --default-character-set=utf8mb4 cmc < scripts/report_remaining_corruption.sql +-- ============================================================================ + +SET NAMES utf8mb4; + +SELECT '============================================================' as ''; +SELECT 'CORRUPTED DATA REPORT' as ''; +SELECT '============================================================' as ''; + +-- Summary counts +SELECT '' as ''; +SELECT 'SUMMARY OF REMAINING CORRUPTION' as ''; +SELECT '-------------------------------' as ''; + +SELECT + 'principles' as table_name, + COUNT(*) as corrupted_records, + 'name, address, city' as affected_columns +FROM principles +WHERE name REGEXP '[^ -~]' + OR address REGEXP '[^ -~]' + OR city REGEXP '[^ -~]' + +UNION ALL + +SELECT + 'customers', + COUNT(*), + 'name' +FROM customers +WHERE name REGEXP '[^ -~]' + +UNION ALL + +SELECT + 'addresses', + COUNT(*), + 'address, city' +FROM addresses +WHERE address REGEXP '[^ -~]' + OR city REGEXP '[^ -~]' + +UNION ALL + +SELECT + 'contacts', + COUNT(*), + 'name' +FROM contacts +WHERE name REGEXP '[^ -~]' + +UNION ALL + +SELECT + 'products', + COUNT(*), + 'name, description' +FROM products +WHERE name REGEXP '[^ -~]' + OR description REGEXP '[^ -~]'; + +-- Detailed listings +SELECT '' as ''; +SELECT '============================================================' as ''; +SELECT 'DETAILED CORRUPTED RECORDS' as ''; +SELECT '============================================================' as ''; + +-- Principles with corruption +SELECT '' as ''; +SELECT 'PRINCIPLES TABLE - Corrupted Records:' as ''; +SELECT '-------------------------------------' as ''; +SELECT + id, + name, + LEFT(address, 50) as address_preview, + city +FROM principles +WHERE name REGEXP '[^ -~]' + OR address REGEXP '[^ -~]' + OR city REGEXP '[^ -~]' +ORDER BY id; + +-- Customers with corruption +SELECT '' as ''; +SELECT 'CUSTOMERS TABLE - Corrupted Records:' as ''; +SELECT '-------------------------------------' as ''; +SELECT + id, + name +FROM customers +WHERE name REGEXP '[^ -~]' +ORDER BY id; + +-- Addresses with corruption +SELECT '' as ''; +SELECT 'ADDRESSES TABLE - Corrupted Records:' as ''; +SELECT '-------------------------------------' as ''; +SELECT + id, + LEFT(address, 60) as address, + city +FROM addresses +WHERE address REGEXP '[^ -~]' + OR city REGEXP '[^ -~]' +ORDER BY id +LIMIT 50; + +-- Contacts with corruption +SELECT '' as ''; +SELECT 'CONTACTS TABLE - Corrupted Records:' as ''; +SELECT '-------------------------------------' as ''; +SELECT + id, + name +FROM contacts +WHERE name REGEXP '[^ -~]' +ORDER BY id +LIMIT 50; + +-- Products with corruption +SELECT '' as ''; +SELECT 'PRODUCTS TABLE - Corrupted Records:' as ''; +SELECT '-------------------------------------' as ''; +SELECT + id, + name, + LEFT(description, 50) as description_preview +FROM products +WHERE name REGEXP '[^ -~]' + OR description REGEXP '[^ -~]' +ORDER BY id +LIMIT 50; + +-- Common corruption patterns found +SELECT '' as ''; +SELECT '============================================================' as ''; +SELECT 'COMMON CORRUPTION PATTERNS DETECTED' as ''; +SELECT '============================================================' as ''; + +SELECT + '’' as mojibake_pattern, + 'Smart apostrophe (')' as should_be, + COUNT(*) as occurrences_in_customers +FROM customers +WHERE name LIKE '%’%' + +UNION ALL + +SELECT + '“ or â€', + 'Smart quotes (" ")', + COUNT(*) +FROM customers +WHERE name LIKE '%“%' OR name LIKE '%â€%' + +UNION ALL + +SELECT + 'é, ü, ó, etc', + 'Accented chars (é, ü, ó)', + COUNT(*) +FROM customers +WHERE name REGEXP 'Ã[©¼³±§]' + +UNION ALL + +SELECT + 'â€" or â€"', + 'Dashes (– —)', + COUNT(*) +FROM addresses +WHERE address LIKE '%â€"%' OR address LIKE '%â€"%' + +UNION ALL + +SELECT + 'Ä, ž, Å¡, etc', + 'Czech/Polish chars', + COUNT(*) +FROM principles +WHERE address REGEXP '[ÄÅ][¾¡›™]'; + +SELECT '' as ''; +SELECT '============================================================' as ''; +SELECT 'END OF REPORT' as ''; +SELECT '============================================================' as ''; diff --git a/scripts/run_corruption_fix.sh b/scripts/run_corruption_fix.sh new file mode 100755 index 00000000..c2611803 --- /dev/null +++ b/scripts/run_corruption_fix.sh @@ -0,0 +1,171 @@ +#!/bin/bash +# ============================================================================ +# Safe Corruption Fix Execution Script +# ============================================================================ +# This script: +# 1. Creates a backup +# 2. Shows current corruption status +# 3. Prompts for confirmation +# 4. Applies fixes +# 5. Shows before/after comparison +# ============================================================================ + +set -e # Exit on error + +DB_USER="root" +DB_PASS="secureRootPassword" +DB_NAME="cmc" +CONTAINER="cmc-db" +BACKUP_DIR="./backups" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +echo -e "${BLUE}============================================================${NC}" +echo -e "${BLUE}CMC Database Corruption Fix Script${NC}" +echo -e "${BLUE}============================================================${NC}" +echo "" + +# Create backups directory +mkdir -p "$BACKUP_DIR" + +# Step 1: Create backup +echo -e "${YELLOW}Step 1: Creating database backup...${NC}" +docker exec $CONTAINER mariadb-dump \ + -u $DB_USER \ + -p$DB_PASS \ + --default-character-set=utf8mb4 \ + --single-transaction \ + $DB_NAME | gzip > "$BACKUP_DIR/backup_before_corruption_fix_$TIMESTAMP.sql.gz" + +BACKUP_SIZE=$(du -h "$BACKUP_DIR/backup_before_corruption_fix_$TIMESTAMP.sql.gz" | cut -f1) +echo -e "${GREEN}✓ Backup created: backup_before_corruption_fix_$TIMESTAMP.sql.gz (${BACKUP_SIZE})${NC}" +echo "" + +# Step 2: Show current corruption status +echo -e "${YELLOW}Step 2: Current corruption status${NC}" +echo "-----------------------------------" +docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS --default-character-set=utf8mb4 $DB_NAME -e " +SELECT + 'principles' as table_name, + COUNT(*) as corrupted_records +FROM principles +WHERE name REGEXP '[^ -~]' + OR address REGEXP '[^ -~]' + OR city REGEXP '[^ -~]' +UNION ALL +SELECT 'customers', COUNT(*) +FROM customers +WHERE name REGEXP '[^ -~]' +UNION ALL +SELECT 'addresses', COUNT(*) +FROM addresses +WHERE address REGEXP '[^ -~]' + OR city REGEXP '[^ -~]'; +" +echo "" + +# Step 3: Show sample corrupted data +echo -e "${YELLOW}Step 3: Sample corrupted data (BEFORE fix)${NC}" +echo "-------------------------------------------" +docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS --default-character-set=utf8mb4 $DB_NAME -e " +SELECT 'CUSTOMERS' as table_name, id, name FROM customers WHERE id IN (253, 1006, 1387, 1608) +UNION ALL +SELECT 'PRINCIPLES', id, name FROM principles WHERE id IN (2, 9, 13, 14); +" +echo "" + +# Step 4: Prompt for confirmation +echo -e "${RED}IMPORTANT:${NC}" +echo "- Backup created at: $BACKUP_DIR/backup_before_corruption_fix_$TIMESTAMP.sql.gz" +echo "- This will modify data in tables: principles, customers, addresses" +echo "- Changes fix mojibake like: ’ → ', é → é, etc." +echo "" +read -p "Do you want to proceed with the fix? (yes/no): " CONFIRM + +if [ "$CONFIRM" != "yes" ]; then + echo -e "${RED}Fix cancelled by user${NC}" + exit 0 +fi + +# Step 5: Apply fixes +echo "" +echo -e "${YELLOW}Step 5: Applying corruption fixes...${NC}" +docker exec -i $CONTAINER mariadb \ + -u $DB_USER \ + -p$DB_PASS \ + --default-character-set=utf8mb4 \ + $DB_NAME < scripts/fix_corrupted_data.sql + +echo -e "${GREEN}✓ Fix script executed${NC}" +echo "" + +# Step 6: Show results +echo -e "${YELLOW}Step 6: Results (AFTER fix)${NC}" +echo "----------------------------" +echo "" +echo "Sample fixed data:" +docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS --default-character-set=utf8mb4 $DB_NAME -e " +SELECT 'CUSTOMERS' as table_name, id, name FROM customers WHERE id IN (253, 1006, 1387, 1608) +UNION ALL +SELECT 'PRINCIPLES', id, name FROM principles WHERE id IN (2, 9, 13, 14); +" +echo "" + +echo "Remaining corruption counts:" +docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS --default-character-set=utf8mb4 $DB_NAME -e " +SELECT + 'principles' as table_name, + COUNT(*) as corrupted_records +FROM principles +WHERE name REGEXP '[^ -~]' + OR address REGEXP '[^ -~]' + OR city REGEXP '[^ -~]' +UNION ALL +SELECT 'customers', COUNT(*) +FROM customers +WHERE name REGEXP '[^ -~]' +UNION ALL +SELECT 'addresses', COUNT(*) +FROM addresses +WHERE address REGEXP '[^ -~]' + OR city REGEXP '[^ -~]'; +" +echo "" + +# Step 7: Test new insert +echo -e "${YELLOW}Step 7: Testing new inserts with special characters...${NC}" +docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS --default-character-set=utf8mb4 $DB_NAME -e " +START TRANSACTION; +INSERT INTO principles (name, code, address, city, state, postcode, country_id, currency_id, short_name, url) +VALUES ('Test Müller & Söhne GmbH', 999999, 'Straße 123', 'München', 'Bayern', '80331', 1, 1, 'TEST', 'http://test.com'); +SELECT name, address, city FROM principles WHERE code = 999999; +ROLLBACK; +" +echo -e "${GREEN}✓ New inserts working correctly${NC}" +echo "" + +# Final summary +echo -e "${BLUE}============================================================${NC}" +echo -e "${GREEN}FIX COMPLETED SUCCESSFULLY!${NC}" +echo -e "${BLUE}============================================================${NC}" +echo "" +echo "Summary:" +echo "- Backup location: $BACKUP_DIR/backup_before_corruption_fix_$TIMESTAMP.sql.gz" +echo "- Fixed common patterns: smart quotes, accented characters, dashes" +echo "- Fixed specific records: ~30+ known corrupted records" +echo "" +echo "Next steps:" +echo "1. Review the fixed data in the application" +echo "2. Run: bash scripts/report_remaining_corruption.sh to see what's left" +echo "3. Manually fix any remaining corrupted records" +echo "" +echo "To rollback if needed:" +echo " gunzip < $BACKUP_DIR/backup_before_corruption_fix_$TIMESTAMP.sql.gz | \\" +echo " docker exec -i $CONTAINER mariadb -u $DB_USER -p$DB_PASS $DB_NAME" +echo "" diff --git a/scripts/verify_charset_fix.sh b/scripts/verify_charset_fix.sh new file mode 100755 index 00000000..7355cf3f --- /dev/null +++ b/scripts/verify_charset_fix.sh @@ -0,0 +1,163 @@ +#!/bin/bash +# Script to verify character encoding fix in MariaDB + +DB_USER="root" +DB_PASS="secureRootPassword" +DB_NAME="cmc" +CONTAINER="cmc-db" + +echo "==========================================" +echo "Character Encoding Verification Script" +echo "==========================================" +echo "" + +# Function to run MariaDB command +run_sql() { + docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS $DB_NAME -e "$1" +} + +# Function to run MariaDB command with UTF-8 client +run_sql_utf8() { + docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS --default-character-set=utf8mb4 $DB_NAME -e "$1" +} + +echo "1. Check database default charset" +echo "-----------------------------------" +docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS -e " +SELECT @@character_set_database, @@collation_database; +" +echo "" + +echo "2. Count tables by charset (BEFORE should show ~59 latin1)" +echo "------------------------------------------------------------" +run_sql " +SELECT + CASE + WHEN TABLE_COLLATION LIKE 'latin1%' THEN 'latin1' + WHEN TABLE_COLLATION LIKE 'utf8mb4%' THEN 'utf8mb4' + WHEN TABLE_COLLATION LIKE 'utf8mb3%' THEN 'utf8mb3' + ELSE TABLE_COLLATION + END as charset, + COUNT(*) as table_count +FROM information_schema.TABLES +WHERE TABLE_SCHEMA='$DB_NAME' +GROUP BY charset +ORDER BY table_count DESC; +" +echo "" + +echo "3. Check specific table charsets (sample)" +echo "------------------------------------------" +run_sql " +SELECT TABLE_NAME, TABLE_COLLATION +FROM information_schema.TABLES +WHERE TABLE_SCHEMA='$DB_NAME' +AND TABLE_NAME IN ('principles', 'customers', 'addresses', 'contacts', 'products') +ORDER BY TABLE_NAME; +" +echo "" + +echo "4. Show column charsets for principles table" +echo "---------------------------------------------" +run_sql " +SELECT COLUMN_NAME, CHARACTER_SET_NAME, COLLATION_NAME +FROM information_schema.COLUMNS +WHERE TABLE_SCHEMA='$DB_NAME' +AND TABLE_NAME='principles' +AND DATA_TYPE IN ('varchar', 'text', 'char') +ORDER BY ORDINAL_POSITION; +" +echo "" + +echo "5. Test INSERT of special characters (will be rolled back)" +echo "-----------------------------------------------------------" +echo "Attempting to insert test record with special characters..." +run_sql_utf8 " +START TRANSACTION; + +INSERT INTO principles (name, code, address, city, state, postcode, country_id, currency_id, short_name, url) +VALUES ( + 'Test Müller & Söhne GmbH', -- German umlauts + 999999, + 'Straße 123, 2. Stock', -- German ß and ² symbol + 'München', -- German ü + 'Bayern', + '80331', + 1, + 1, + 'TEST', + 'http://test.com' +); + +-- Read it back immediately +SELECT id, name, address, city +FROM principles +WHERE code = 999999; + +ROLLBACK; +" 2>&1 | grep -A 10 "id\|Test Müller\|ERROR" +echo "✓ Test insert rolled back" +echo "" + +echo "6. Verify current corrupted data in principles" +echo "-----------------------------------------------" +echo "These should show corruption if NOT yet fixed:" +run_sql_utf8 " +SELECT id, name, city +FROM principles +WHERE id IN (9, 18); +" +echo "" + +echo "7. Verify corrupted data in customers" +echo "--------------------------------------" +run_sql_utf8 " +SELECT id, name +FROM customers +WHERE id IN (253, 1006, 1387, 1608); +" +echo "" + +echo "8. Count potentially corrupted records across tables" +echo "-----------------------------------------------------" +run_sql " +SELECT + 'principles' as table_name, + COUNT(*) as rows_with_non_ascii +FROM principles +WHERE name REGEXP '[^ -~]' + OR address REGEXP '[^ -~]' + OR city REGEXP '[^ -~]' +UNION ALL +SELECT + 'customers', + COUNT(*) +FROM customers +WHERE name REGEXP '[^ -~]' +UNION ALL +SELECT + 'addresses', + COUNT(*) +FROM addresses +WHERE address REGEXP '[^ -~]' + OR city REGEXP '[^ -~]' +UNION ALL +SELECT + 'contacts', + COUNT(*) +FROM contacts +WHERE firstname REGEXP '[^ -~]' + OR lastname REGEXP '[^ -~]'; +" +echo "" + +echo "==========================================" +echo "Verification complete!" +echo "==========================================" +echo "" +echo "INTERPRETATION:" +echo "- If table_count for 'latin1' is ~59: NOT YET FIXED" +echo "- If table_count for 'utf8mb4' is ~62: FIXED!" +echo "- Test insert should show proper characters if fixed" +echo "- Corrupted data will still show mojibake until manually repaired" +echo "" diff --git a/scripts/verify_charset_manual.sql b/scripts/verify_charset_manual.sql new file mode 100644 index 00000000..5a27ea35 --- /dev/null +++ b/scripts/verify_charset_manual.sql @@ -0,0 +1,207 @@ +-- ============================================================================ +-- Manual MariaDB Character Encoding Verification Commands +-- ============================================================================ +-- Run these commands directly in MariaDB to verify the charset fix +-- +-- Connect to database: +-- docker exec -it cmc-db mariadb -u root -psecureRootPassword --default-character-set=utf8mb4 cmc +-- ============================================================================ + +-- 1. CHECK DATABASE-LEVEL SETTINGS +-- Should show: utf8mb4 +SELECT @@character_set_database as db_charset, + @@collation_database as db_collation; + +-- 2. COUNT TABLES BY CHARACTER SET +-- BEFORE FIX: ~59 latin1 tables +-- AFTER FIX: ~62 utf8mb4 tables +SELECT + CASE + WHEN TABLE_COLLATION LIKE 'latin1%' THEN 'latin1' + WHEN TABLE_COLLATION LIKE 'utf8mb4%' THEN 'utf8mb4' + WHEN TABLE_COLLATION LIKE 'utf8mb3%' THEN 'utf8mb3' + ELSE TABLE_COLLATION + END as charset, + COUNT(*) as table_count +FROM information_schema.TABLES +WHERE TABLE_SCHEMA='cmc' +GROUP BY charset +ORDER BY table_count DESC; + +-- 3. CHECK SPECIFIC TABLES (Key business tables) +SELECT TABLE_NAME, TABLE_COLLATION, ENGINE +FROM information_schema.TABLES +WHERE TABLE_SCHEMA='cmc' +AND TABLE_NAME IN ( + 'principles', 'customers', 'addresses', 'contacts', + 'products', 'quotes', 'purchase_orders', 'invoices' +) +ORDER BY TABLE_NAME; + +-- 4. CHECK ALL COLUMN CHARSETS FOR 'principles' TABLE +-- All text columns should show utf8mb4 after fix +SELECT + COLUMN_NAME, + DATA_TYPE, + CHARACTER_SET_NAME, + COLLATION_NAME +FROM information_schema.COLUMNS +WHERE TABLE_SCHEMA='cmc' +AND TABLE_NAME='principles' +AND DATA_TYPE IN ('varchar', 'text', 'char') +ORDER BY ORDINAL_POSITION; + +-- 5. TEST INSERT WITH SPECIAL CHARACTERS +-- This will be rolled back - just testing if charset accepts them +START TRANSACTION; + +-- Insert test record with various special characters +INSERT INTO principles ( + name, code, address, city, state, postcode, + country_id, currency_id, short_name, url +) +VALUES ( + 'Test Müller & Söhne GmbH', -- German umlauts (ü, ö) + 999999, + 'Straße 123, 2. Stock', -- German ß + 'München', -- German ü + 'Bayern', + '80331', + 1, 1, 'TEST', 'http://test.com' +); + +-- Read it back - should show properly if charset is correct +SELECT id, name, address, city +FROM principles +WHERE code = 999999; + +-- Expected output if FIXED: +-- name: Test Müller & Söhne GmbH (proper umlauts) +-- address: Straße 123, 2. Stock +-- city: München + +-- Expected output if NOT FIXED: +-- Characters will be corrupted or insert will fail + +ROLLBACK; -- Don't actually save the test record + +-- 6. VIEW KNOWN CORRUPTED RECORDS +-- These show what the corruption looks like before manual repair + +-- Principle ID 9 - Czech characters corrupted +SELECT id, name, address, city +FROM principles +WHERE id = 9; +-- BEFORE FIX: BezruÄova 2901, Rožnov pod RadhoÅ¡tÄ›m +-- AFTER FIX (still needs manual repair): Same corruption +-- AFTER MANUAL REPAIR: Bezručova 2901, Rožnov pod Radhoštěm + +-- Principle ID 18 - German umlaut in city +SELECT id, short_name, city, country_id +FROM principles +WHERE id = 18; + +-- Customer corrupted names +SELECT id, name +FROM customers +WHERE id IN (253, 1006, 1387, 1608, 2174); +-- Look for: ’ (should be '), É (should be É), “ (should be ") + +-- Address corrupted data +SELECT id, address, city +FROM addresses +WHERE id IN (19, 25); +-- Look for: â€" (should be –) + +-- 7. COUNT RECORDS WITH NON-ASCII CHARACTERS +-- Shows how many records might need manual repair after charset fix +SELECT + 'principles' as table_name, + COUNT(*) as records_with_non_ascii +FROM principles +WHERE name REGEXP '[^ -~]' + OR address REGEXP '[^ -~]' + OR city REGEXP '[^ -~]' + +UNION ALL + +SELECT 'customers', COUNT(*) +FROM customers +WHERE name REGEXP '[^ -~]' + +UNION ALL + +SELECT 'addresses', COUNT(*) +FROM addresses +WHERE address REGEXP '[^ -~]' + OR city REGEXP '[^ -~]' + +UNION ALL + +SELECT 'contacts', COUNT(*) +FROM contacts +WHERE firstname REGEXP '[^ -~]' + OR lastname REGEXP '[^ -~]'; + +-- 8. VERIFY CONNECTION CHARACTER SET +-- Your client should also be using utf8mb4 +SHOW VARIABLES LIKE 'character_set_%'; +SHOW VARIABLES LIKE 'collation_%'; + +-- Should see: +-- character_set_client: utf8mb4 +-- character_set_connection: utf8mb4 +-- character_set_results: utf8mb4 +-- character_set_database: utf8mb4 + +-- ============================================================================ +-- AFTER CONVERSION - TEST NEW INSERTS +-- ============================================================================ +-- After running convert_charset_to_utf8mb4.sql, try these tests: + +-- Test 1: Insert with German characters +START TRANSACTION; +INSERT INTO principles (name, code, address, city, state, postcode, country_id, currency_id, short_name, url) +VALUES ('Müller GmbH', 999991, 'Hauptstraße 1', 'München', 'BY', '80331', 1, 1, 'MUL', 'https://test.de'); +SELECT * FROM principles WHERE code = 999991; +-- Should show: Müller, Hauptstraße, München (correctly) +ROLLBACK; + +-- Test 2: Insert with French characters +START TRANSACTION; +INSERT INTO principles (name, code, address, city, state, postcode, country_id, currency_id, short_name, url) +VALUES ('Société François', 999992, 'Rue de la Paix', 'Orléans', 'FR', '45000', 1, 1, 'SOC', 'https://test.fr'); +SELECT * FROM principles WHERE code = 999992; +-- Should show: Société, François, Orléans (correctly) +ROLLBACK; + +-- Test 3: Insert with special punctuation +START TRANSACTION; +INSERT INTO customers (name, trading_name, abn, created, notes, discount_pricing_policies, payment_terms, customer_category_id, url, country_id) +VALUES ('O'Connor & Sons — "Quality" Products', 'O'Connor Trading', '', NOW(), '', '', '', 1, '', 1); +SELECT name FROM customers WHERE name LIKE '%O''Connor%' ORDER BY id DESC LIMIT 1; +-- Should show: O'Connor & Sons — "Quality" Products (with proper apostrophes, em-dash, and quotes) +ROLLBACK; + +-- ============================================================================ +-- SUMMARY OF WHAT TO LOOK FOR +-- ============================================================================ + +/* +BEFORE FIX: +- Table charset: latin1_swedish_ci +- Column charset: latin1 +- Test insert: May fail or corrupt characters +- Existing data: Shows mojibake (É, ’, etc.) + +AFTER FIX (charset conversion only): +- Table charset: utf8mb4_general_ci +- Column charset: utf8mb4 +- Test insert: Succeeds with proper characters +- Existing data: STILL shows mojibake (needs manual repair) + +AFTER MANUAL REPAIR: +- Everything displays correctly +- New inserts work properly +- Old data has been corrected +*/