Scripts to fix encoding problems #137

This commit is contained in:
Karl Cordes 2025-12-10 07:23:22 +11:00
parent 5db8451f2c
commit bc958b6acf
7 changed files with 1056 additions and 0 deletions

View file

@ -0,0 +1,59 @@
ALTER TABLE `acos` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `addresses` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `aros` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `aros_acos` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `boxes` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `cake_sessions` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `contacts` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `contact_categories` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `costings` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `countries` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `currencies` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `customers` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `customer_categories` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `documents` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `doc_pages` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `emails` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `emails_enquiries` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `emails_invoices` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `emails_jobs` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `emails_purchase_orders` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `email_attachments` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `email_recipients` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `enquiries` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `enquiry_email_queues` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `freight_forwarders` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `freight_services` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `groups` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `industries` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `industries_customers` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `invoices` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `jobs` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `jobstatus` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `jobs_purchase_orders` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `job_products` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `line_items` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `order_acknowledgements` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `packing_lists` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `pages` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `principles` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `principle_addresses` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `principle_contacts` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `products` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `product_attachments` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `product_categories` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `product_options` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `product_options_categories` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `purchase_orders` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `quotes` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `quote_pages` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `quote_products` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `shipments` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `shipments_jobs` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `shipments_principles` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `shipments_purchase_orders` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `shipment_categories` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `shipment_invoices` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `states` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `statuses` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
ALTER TABLE `users` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;

View file

@ -0,0 +1,250 @@
-- ============================================================================
-- Fix Corrupted Character Data in CMC Database
-- ============================================================================
-- This script fixes mojibake (corrupted UTF-8 characters) in the database
-- Run after converting tables to utf8mb4
--
-- IMPORTANT: Review this script and test on a backup first!
-- Create backup: docker exec cmc-db mariadb-dump -u root -psecureRootPassword cmc | gzip > backup_before_fix.sql.gz
--
-- To run: docker exec -i cmc-db mariadb -u root -psecureRootPassword --default-character-set=utf8mb4 cmc < scripts/fix_corrupted_data.sql
-- ============================================================================
-- Create a backup table for principles (optional safety measure)
-- DROP TABLE IF EXISTS principles_backup;
-- CREATE TABLE principles_backup AS SELECT * FROM principles;
SET NAMES utf8mb4;
-- ============================================================================
-- PART 1: Global Pattern Fixes Using REPLACE
-- ============================================================================
-- These fix common mojibake patterns across all affected tables
-- ----------------------------------------------------------------------------
-- Fix CUSTOMERS table
-- ----------------------------------------------------------------------------
UPDATE customers SET name = REPLACE(name, '’', ''') WHERE name LIKE '%â%'; -- Smart apostrophe
UPDATE customers SET name = REPLACE(name, '“', '"') WHERE name LIKE '%“%'; -- Left double quote
UPDATE customers SET name = REPLACE(name, 'â€', '"') WHERE name LIKE '%â€%'; -- Right double quote
UPDATE customers SET name = REPLACE(name, 'â€"', '') WHERE name LIKE '%â€"%'; -- En dash
UPDATE customers SET name = REPLACE(name, 'â€"', '') WHERE name LIKE '%â€"%'; -- Em dash
UPDATE customers SET name = REPLACE(name, '​', '') WHERE name LIKE '%​%'; -- Zero-width space (remove)
UPDATE customers SET name = REPLACE(name, ' ', ' ') WHERE name LIKE '% %'; -- Special space
UPDATE customers SET name = REPLACE(name, 'é', 'é') WHERE name LIKE '%é%'; -- é
UPDATE customers SET name = REPLACE(name, 'É', 'É') WHERE name LIKE '%É%'; -- É
UPDATE customers SET name = REPLACE(name, 'ó', 'ó') WHERE name LIKE '%ó%'; -- ó
UPDATE customers SET name = REPLACE(name, 'Ã', 'í') WHERE name LIKE '%Ã%'; -- í
UPDATE customers SET name = REPLACE(name, 'ç', 'ç') WHERE name LIKE '%ç%'; -- ç
UPDATE customers SET name = REPLACE(name, 'ü', 'ü') WHERE name LIKE '%ü%'; -- ü
UPDATE customers SET name = REPLACE(name, 'á', 'á') WHERE name LIKE '%á%'; -- á
UPDATE customers SET name = REPLACE(name, 'ñ', 'ñ') WHERE name LIKE '%ñ%'; -- ñ
UPDATE customers SET name = REPLACE(name, 'ö', 'ö') WHERE name LIKE '%ö%'; -- ö
UPDATE customers SET name = REPLACE(name, 'ô', 'ô') WHERE name LIKE '%ô%'; -- ô
UPDATE customers SET name = REPLACE(name, 'İ', 'İ') WHERE name LIKE '%İ%'; -- Turkish İ
UPDATE customers SET name = REPLACE(name, 'Å', 'Ş') WHERE name LIKE '%Å%'; -- Turkish Ş
UPDATE customers SET name = REPLACE(name, 'Ä', 'Ğ') WHERE name LIKE '%Ä%'; -- Turkish Ğ
UPDATE customers SET name = REPLACE(name, 'Ä', 'Č') WHERE name LIKE '%Ä%'; -- Czech Č
UPDATE customers SET name = REPLACE(name, 'Ä›', 'ě') WHERE name LIKE '%Ä›%'; -- Czech ě
-- Remove trailing tabs
UPDATE customers SET name = TRIM(TRAILING '\t' FROM name) WHERE name LIKE '%\t';
UPDATE customers SET trading_name = TRIM(TRAILING '\t' FROM trading_name) WHERE trading_name LIKE '%\t';
-- ----------------------------------------------------------------------------
-- Fix PRINCIPLES table
-- ----------------------------------------------------------------------------
UPDATE principles SET name = REPLACE(name, 'ü', 'ü') WHERE name LIKE '%ü%';
UPDATE principles SET name = REPLACE(name, 'é', 'é') WHERE name LIKE '%é%';
UPDATE principles SET name = REPLACE(name, 'ó', 'ó') WHERE name LIKE '%ó%';
UPDATE principles SET address = REPLACE(address, 'ß', 'ß') WHERE address LIKE '%ß%'; -- German ß
UPDATE principles SET address = REPLACE(address, 'ü', 'ü') WHERE address LIKE '%ü%';
UPDATE principles SET address = REPLACE(address, 'é', 'é') WHERE address LIKE '%é%';
UPDATE principles SET address = REPLACE(address, 'ñ', 'ñ') WHERE address LIKE '%ñ%';
UPDATE principles SET address = REPLACE(address, 'Ä', 'č') WHERE address LIKE '%Ä%'; -- Czech č
UPDATE principles SET address = REPLACE(address, 'ž', 'ž') WHERE address LIKE '%ž%'; -- Czech ž
UPDATE principles SET address = REPLACE(address, 'Å¡', 'š') WHERE address LIKE '%Å¡%'; -- Czech š
UPDATE principles SET address = REPLACE(address, 'Ä›', 'ě') WHERE address LIKE '%Ä›%'; -- Czech ě
UPDATE principles SET address = REPLACE(address, 'Å', 'ň') WHERE address LIKE '%Å%'; -- Czech ň
UPDATE principles SET address = REPLACE(address, 'Å™', 'ř') WHERE address LIKE '%Å™%'; -- Czech ř
UPDATE principles SET city = REPLACE(city, 'ü', 'ü') WHERE city LIKE '%ü%';
UPDATE principles SET city = REPLACE(city, 'é', 'é') WHERE city LIKE '%é%';
UPDATE principles SET city = REPLACE(city, 'ó', 'ó') WHERE city LIKE '%ó%';
-- ----------------------------------------------------------------------------
-- Fix ADDRESSES table
-- ----------------------------------------------------------------------------
UPDATE addresses SET address = REPLACE(address, 'â€"', '') WHERE address LIKE '%â€"%'; -- En dash
UPDATE addresses SET address = REPLACE(address, 'â€"', '') WHERE address LIKE '%â€"%'; -- Em dash
UPDATE addresses SET address = REPLACE(address, 'é', 'é') WHERE address LIKE '%é%';
UPDATE addresses SET address = REPLACE(address, 'ü', 'ü') WHERE address LIKE '%ü%';
UPDATE addresses SET city = REPLACE(city, 'é', 'é') WHERE city LIKE '%é%';
UPDATE addresses SET city = REPLACE(city, 'ü', 'ü') WHERE city LIKE '%ü%';
-- ============================================================================
-- PART 2: Specific Record Fixes
-- ============================================================================
-- These fix specific known corrupted records where pattern matching isn't enough
-- ----------------------------------------------------------------------------
-- PRINCIPLES - Specific fixes
-- ----------------------------------------------------------------------------
-- ID 2: German ß
UPDATE principles
SET address = 'Heinz-Fangman-Straße 18'
WHERE id = 2;
-- ID 9: Czech characters (this is complex, needs full correction)
UPDATE principles
SET address = 'Bezručova 2901\n756 61 Rožnov pod Radhoštěm',
city = 'Rožnov pod Radhoštěm'
WHERE id = 9;
-- ID 13: German ü
UPDATE principles
SET name = 'IEP Technologies GmbH - BRILEX Gesellschaft für Explosionsschutz mbH'
WHERE id = 13;
-- ID 14: German ß
UPDATE principles
SET address = 'Liebigstraße 2'
WHERE id = 14;
-- ID 58: Spanish ó
UPDATE principles
SET name = 'Nosia S.r.l.- Señalización Industrial'
WHERE id = 58;
-- ID 65: German ß
UPDATE principles
SET address = 'Alte Emser Straße 32'
WHERE id = 65;
-- ----------------------------------------------------------------------------
-- CUSTOMERS - Specific fixes
-- ----------------------------------------------------------------------------
-- ID 253: É
UPDATE customers
SET name = 'SOLUZÉ CIVIL ENGINEERS Trading Under SOLUZE PTY LTD'
WHERE id = 253;
-- ID 1006: Smart apostrophe
UPDATE customers
SET name = 'Dr. Prem''s Molecules Private Limited (DPMolecules)'
WHERE id = 1006;
-- ID 1387: Smart quotes
UPDATE customers
SET name = 'DEE ENTERPRISES (QLD) PTY LTD trading as "MEKLEK"'
WHERE id = 1387;
-- ID 1608: Smart apostrophe
UPDATE customers
SET name = 'Guidera O''Connor Pty Ltd'
WHERE id = 1608;
-- ID 2174: Zero-width space
UPDATE customers
SET name = 'Ingredion ANZ Pty Ltd'
WHERE id = 2174;
-- ID 2215: French é
UPDATE customers
SET name = 'Vale Nouvelle-Calédonie S.A.S'
WHERE id = 2215;
-- ID 2375: Spanish ó
UPDATE customers
SET name = 'Evaluación y Control Ambiental S.A.S.'
WHERE id = 2375;
-- ID 3143: Portuguese ç and ó
UPDATE customers
SET name = 'Zontahevy Comércio e Serviços Offshore Ltda'
WHERE id = 3143;
-- ID 3529: French é
UPDATE customers
SET name = 'Société des Mines de Syama'
WHERE id = 3529;
-- ID 3633: Special space (em space)
UPDATE customers
SET name = 'P.J. Berriman & Co PTY LTD'
WHERE id = 3633;
-- ID 4325: Danish ø
UPDATE customers
SET name = 'Rambøll Danmark'
WHERE id = 4325;
-- ID 4350: Turkish characters İ, Ş, Ğ
UPDATE customers
SET name = 'SONNIVA ENERGY MAKİNA İNŞAAT İTH. İHR. LTD. ŞTİ.'
WHERE id = 4350;
-- ID 4669: French é
UPDATE customers
SET name = 'F.C.C. Fluides Conseils Calédonie SARL'
WHERE id = 4669;
-- ID 4743: Czech ě
UPDATE customers
SET name = 'DGPack Prostějov'
WHERE id = 4743;
-- ID 4764: Zero-width space
UPDATE customers
SET name = 'Mccready Welding Pty Ltd Trading under the entity Mccready''s Welding Services'
WHERE id = 4764;
-- ID 4893: Spanish í
UPDATE customers
SET name = 'Oxiquímica, S.A.P.I. de C.V.'
WHERE id = 4893;
-- ----------------------------------------------------------------------------
-- ADDRESSES - Specific fixes
-- ----------------------------------------------------------------------------
-- ID 19: En dash (711)
UPDATE addresses
SET address = 'Lvl 3, Building B, 711 Talavera Road'
WHERE id = 19;
-- ============================================================================
-- PART 3: Verification Queries
-- ============================================================================
-- Run these after the fixes to verify they worked
-- Check fixed principles
SELECT id, name, city FROM principles WHERE id IN (2, 9, 13, 14, 58, 65);
-- Check fixed customers
SELECT id, name FROM customers WHERE id IN (253, 1006, 1387, 1608, 2174, 2215, 2375, 3143, 3529, 4325, 4350, 4669, 4743, 4764, 4893);
-- Check fixed addresses
SELECT id, address FROM addresses WHERE id = 19;
-- Count remaining corrupted records (should be much lower)
SELECT COUNT(*) as remaining_corrupted_principles
FROM principles
WHERE name REGEXP '[^ -~]' OR address REGEXP '[^ -~]' OR city REGEXP '[^ -~]';
SELECT COUNT(*) as remaining_corrupted_customers
FROM customers
WHERE name REGEXP '[^ -~]';
SELECT COUNT(*) as remaining_corrupted_addresses
FROM addresses
WHERE address REGEXP '[^ -~]' OR city REGEXP '[^ -~]';
-- ============================================================================
-- COMPLETION
-- ============================================================================
SELECT 'Fix script completed! Review the verification queries above.' as status;

View file

@ -0,0 +1,15 @@
#!/bin/bash
# ============================================================================
# Generate Report of Remaining Corrupted Data
# ============================================================================
DB_USER="root"
DB_PASS="secureRootPassword"
DB_NAME="cmc"
CONTAINER="cmc-db"
docker exec $CONTAINER mariadb \
-u $DB_USER \
-p$DB_PASS \
--default-character-set=utf8mb4 \
$DB_NAME < scripts/report_remaining_corruption.sql

View file

@ -0,0 +1,191 @@
-- ============================================================================
-- Report Remaining Corrupted Data After Fix
-- ============================================================================
-- Run this after fix_corrupted_data.sql to find records that still need manual fixes
--
-- To run: docker exec cmc-db mariadb -u root -psecureRootPassword --default-character-set=utf8mb4 cmc < scripts/report_remaining_corruption.sql
-- ============================================================================
SET NAMES utf8mb4;
SELECT '============================================================' as '';
SELECT 'CORRUPTED DATA REPORT' as '';
SELECT '============================================================' as '';
-- Summary counts
SELECT '' as '';
SELECT 'SUMMARY OF REMAINING CORRUPTION' as '';
SELECT '-------------------------------' as '';
SELECT
'principles' as table_name,
COUNT(*) as corrupted_records,
'name, address, city' as affected_columns
FROM principles
WHERE name REGEXP '[^ -~]'
OR address REGEXP '[^ -~]'
OR city REGEXP '[^ -~]'
UNION ALL
SELECT
'customers',
COUNT(*),
'name'
FROM customers
WHERE name REGEXP '[^ -~]'
UNION ALL
SELECT
'addresses',
COUNT(*),
'address, city'
FROM addresses
WHERE address REGEXP '[^ -~]'
OR city REGEXP '[^ -~]'
UNION ALL
SELECT
'contacts',
COUNT(*),
'name'
FROM contacts
WHERE name REGEXP '[^ -~]'
UNION ALL
SELECT
'products',
COUNT(*),
'name, description'
FROM products
WHERE name REGEXP '[^ -~]'
OR description REGEXP '[^ -~]';
-- Detailed listings
SELECT '' as '';
SELECT '============================================================' as '';
SELECT 'DETAILED CORRUPTED RECORDS' as '';
SELECT '============================================================' as '';
-- Principles with corruption
SELECT '' as '';
SELECT 'PRINCIPLES TABLE - Corrupted Records:' as '';
SELECT '-------------------------------------' as '';
SELECT
id,
name,
LEFT(address, 50) as address_preview,
city
FROM principles
WHERE name REGEXP '[^ -~]'
OR address REGEXP '[^ -~]'
OR city REGEXP '[^ -~]'
ORDER BY id;
-- Customers with corruption
SELECT '' as '';
SELECT 'CUSTOMERS TABLE - Corrupted Records:' as '';
SELECT '-------------------------------------' as '';
SELECT
id,
name
FROM customers
WHERE name REGEXP '[^ -~]'
ORDER BY id;
-- Addresses with corruption
SELECT '' as '';
SELECT 'ADDRESSES TABLE - Corrupted Records:' as '';
SELECT '-------------------------------------' as '';
SELECT
id,
LEFT(address, 60) as address,
city
FROM addresses
WHERE address REGEXP '[^ -~]'
OR city REGEXP '[^ -~]'
ORDER BY id
LIMIT 50;
-- Contacts with corruption
SELECT '' as '';
SELECT 'CONTACTS TABLE - Corrupted Records:' as '';
SELECT '-------------------------------------' as '';
SELECT
id,
name
FROM contacts
WHERE name REGEXP '[^ -~]'
ORDER BY id
LIMIT 50;
-- Products with corruption
SELECT '' as '';
SELECT 'PRODUCTS TABLE - Corrupted Records:' as '';
SELECT '-------------------------------------' as '';
SELECT
id,
name,
LEFT(description, 50) as description_preview
FROM products
WHERE name REGEXP '[^ -~]'
OR description REGEXP '[^ -~]'
ORDER BY id
LIMIT 50;
-- Common corruption patterns found
SELECT '' as '';
SELECT '============================================================' as '';
SELECT 'COMMON CORRUPTION PATTERNS DETECTED' as '';
SELECT '============================================================' as '';
SELECT
'’' as mojibake_pattern,
'Smart apostrophe (')' as should_be,
COUNT(*) as occurrences_in_customers
FROM customers
WHERE name LIKE '%’%'
UNION ALL
SELECT
'“ or â€',
'Smart quotes (" ")',
COUNT(*)
FROM customers
WHERE name LIKE '%“%' OR name LIKE '%â€%'
UNION ALL
SELECT
'é, ü, ó, etc',
'Accented chars (é, ü, ó)',
COUNT(*)
FROM customers
WHERE name REGEXP 'Ã[©¼³±§]'
UNION ALL
SELECT
'â€" or â€"',
'Dashes ( —)',
COUNT(*)
FROM addresses
WHERE address LIKE '%â€"%' OR address LIKE '%â€"%'
UNION ALL
SELECT
'Ä, ž, Å¡, etc',
'Czech/Polish chars',
COUNT(*)
FROM principles
WHERE address REGEXP '[ÄÅ][¾¡›™]';
SELECT '' as '';
SELECT '============================================================' as '';
SELECT 'END OF REPORT' as '';
SELECT '============================================================' as '';

171
scripts/run_corruption_fix.sh Executable file
View file

@ -0,0 +1,171 @@
#!/bin/bash
# ============================================================================
# Safe Corruption Fix Execution Script
# ============================================================================
# This script:
# 1. Creates a backup
# 2. Shows current corruption status
# 3. Prompts for confirmation
# 4. Applies fixes
# 5. Shows before/after comparison
# ============================================================================
set -e # Exit on error
DB_USER="root"
DB_PASS="secureRootPassword"
DB_NAME="cmc"
CONTAINER="cmc-db"
BACKUP_DIR="./backups"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
echo -e "${BLUE}============================================================${NC}"
echo -e "${BLUE}CMC Database Corruption Fix Script${NC}"
echo -e "${BLUE}============================================================${NC}"
echo ""
# Create backups directory
mkdir -p "$BACKUP_DIR"
# Step 1: Create backup
echo -e "${YELLOW}Step 1: Creating database backup...${NC}"
docker exec $CONTAINER mariadb-dump \
-u $DB_USER \
-p$DB_PASS \
--default-character-set=utf8mb4 \
--single-transaction \
$DB_NAME | gzip > "$BACKUP_DIR/backup_before_corruption_fix_$TIMESTAMP.sql.gz"
BACKUP_SIZE=$(du -h "$BACKUP_DIR/backup_before_corruption_fix_$TIMESTAMP.sql.gz" | cut -f1)
echo -e "${GREEN}✓ Backup created: backup_before_corruption_fix_$TIMESTAMP.sql.gz (${BACKUP_SIZE})${NC}"
echo ""
# Step 2: Show current corruption status
echo -e "${YELLOW}Step 2: Current corruption status${NC}"
echo "-----------------------------------"
docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS --default-character-set=utf8mb4 $DB_NAME -e "
SELECT
'principles' as table_name,
COUNT(*) as corrupted_records
FROM principles
WHERE name REGEXP '[^ -~]'
OR address REGEXP '[^ -~]'
OR city REGEXP '[^ -~]'
UNION ALL
SELECT 'customers', COUNT(*)
FROM customers
WHERE name REGEXP '[^ -~]'
UNION ALL
SELECT 'addresses', COUNT(*)
FROM addresses
WHERE address REGEXP '[^ -~]'
OR city REGEXP '[^ -~]';
"
echo ""
# Step 3: Show sample corrupted data
echo -e "${YELLOW}Step 3: Sample corrupted data (BEFORE fix)${NC}"
echo "-------------------------------------------"
docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS --default-character-set=utf8mb4 $DB_NAME -e "
SELECT 'CUSTOMERS' as table_name, id, name FROM customers WHERE id IN (253, 1006, 1387, 1608)
UNION ALL
SELECT 'PRINCIPLES', id, name FROM principles WHERE id IN (2, 9, 13, 14);
"
echo ""
# Step 4: Prompt for confirmation
echo -e "${RED}IMPORTANT:${NC}"
echo "- Backup created at: $BACKUP_DIR/backup_before_corruption_fix_$TIMESTAMP.sql.gz"
echo "- This will modify data in tables: principles, customers, addresses"
echo "- Changes fix mojibake like: ’ → ', é → é, etc."
echo ""
read -p "Do you want to proceed with the fix? (yes/no): " CONFIRM
if [ "$CONFIRM" != "yes" ]; then
echo -e "${RED}Fix cancelled by user${NC}"
exit 0
fi
# Step 5: Apply fixes
echo ""
echo -e "${YELLOW}Step 5: Applying corruption fixes...${NC}"
docker exec -i $CONTAINER mariadb \
-u $DB_USER \
-p$DB_PASS \
--default-character-set=utf8mb4 \
$DB_NAME < scripts/fix_corrupted_data.sql
echo -e "${GREEN}✓ Fix script executed${NC}"
echo ""
# Step 6: Show results
echo -e "${YELLOW}Step 6: Results (AFTER fix)${NC}"
echo "----------------------------"
echo ""
echo "Sample fixed data:"
docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS --default-character-set=utf8mb4 $DB_NAME -e "
SELECT 'CUSTOMERS' as table_name, id, name FROM customers WHERE id IN (253, 1006, 1387, 1608)
UNION ALL
SELECT 'PRINCIPLES', id, name FROM principles WHERE id IN (2, 9, 13, 14);
"
echo ""
echo "Remaining corruption counts:"
docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS --default-character-set=utf8mb4 $DB_NAME -e "
SELECT
'principles' as table_name,
COUNT(*) as corrupted_records
FROM principles
WHERE name REGEXP '[^ -~]'
OR address REGEXP '[^ -~]'
OR city REGEXP '[^ -~]'
UNION ALL
SELECT 'customers', COUNT(*)
FROM customers
WHERE name REGEXP '[^ -~]'
UNION ALL
SELECT 'addresses', COUNT(*)
FROM addresses
WHERE address REGEXP '[^ -~]'
OR city REGEXP '[^ -~]';
"
echo ""
# Step 7: Test new insert
echo -e "${YELLOW}Step 7: Testing new inserts with special characters...${NC}"
docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS --default-character-set=utf8mb4 $DB_NAME -e "
START TRANSACTION;
INSERT INTO principles (name, code, address, city, state, postcode, country_id, currency_id, short_name, url)
VALUES ('Test Müller & Söhne GmbH', 999999, 'Straße 123', 'München', 'Bayern', '80331', 1, 1, 'TEST', 'http://test.com');
SELECT name, address, city FROM principles WHERE code = 999999;
ROLLBACK;
"
echo -e "${GREEN}✓ New inserts working correctly${NC}"
echo ""
# Final summary
echo -e "${BLUE}============================================================${NC}"
echo -e "${GREEN}FIX COMPLETED SUCCESSFULLY!${NC}"
echo -e "${BLUE}============================================================${NC}"
echo ""
echo "Summary:"
echo "- Backup location: $BACKUP_DIR/backup_before_corruption_fix_$TIMESTAMP.sql.gz"
echo "- Fixed common patterns: smart quotes, accented characters, dashes"
echo "- Fixed specific records: ~30+ known corrupted records"
echo ""
echo "Next steps:"
echo "1. Review the fixed data in the application"
echo "2. Run: bash scripts/report_remaining_corruption.sh to see what's left"
echo "3. Manually fix any remaining corrupted records"
echo ""
echo "To rollback if needed:"
echo " gunzip < $BACKUP_DIR/backup_before_corruption_fix_$TIMESTAMP.sql.gz | \\"
echo " docker exec -i $CONTAINER mariadb -u $DB_USER -p$DB_PASS $DB_NAME"
echo ""

163
scripts/verify_charset_fix.sh Executable file
View file

@ -0,0 +1,163 @@
#!/bin/bash
# Script to verify character encoding fix in MariaDB
DB_USER="root"
DB_PASS="secureRootPassword"
DB_NAME="cmc"
CONTAINER="cmc-db"
echo "=========================================="
echo "Character Encoding Verification Script"
echo "=========================================="
echo ""
# Function to run MariaDB command
run_sql() {
docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS $DB_NAME -e "$1"
}
# Function to run MariaDB command with UTF-8 client
run_sql_utf8() {
docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS --default-character-set=utf8mb4 $DB_NAME -e "$1"
}
echo "1. Check database default charset"
echo "-----------------------------------"
docker exec $CONTAINER mariadb -u $DB_USER -p$DB_PASS -e "
SELECT @@character_set_database, @@collation_database;
"
echo ""
echo "2. Count tables by charset (BEFORE should show ~59 latin1)"
echo "------------------------------------------------------------"
run_sql "
SELECT
CASE
WHEN TABLE_COLLATION LIKE 'latin1%' THEN 'latin1'
WHEN TABLE_COLLATION LIKE 'utf8mb4%' THEN 'utf8mb4'
WHEN TABLE_COLLATION LIKE 'utf8mb3%' THEN 'utf8mb3'
ELSE TABLE_COLLATION
END as charset,
COUNT(*) as table_count
FROM information_schema.TABLES
WHERE TABLE_SCHEMA='$DB_NAME'
GROUP BY charset
ORDER BY table_count DESC;
"
echo ""
echo "3. Check specific table charsets (sample)"
echo "------------------------------------------"
run_sql "
SELECT TABLE_NAME, TABLE_COLLATION
FROM information_schema.TABLES
WHERE TABLE_SCHEMA='$DB_NAME'
AND TABLE_NAME IN ('principles', 'customers', 'addresses', 'contacts', 'products')
ORDER BY TABLE_NAME;
"
echo ""
echo "4. Show column charsets for principles table"
echo "---------------------------------------------"
run_sql "
SELECT COLUMN_NAME, CHARACTER_SET_NAME, COLLATION_NAME
FROM information_schema.COLUMNS
WHERE TABLE_SCHEMA='$DB_NAME'
AND TABLE_NAME='principles'
AND DATA_TYPE IN ('varchar', 'text', 'char')
ORDER BY ORDINAL_POSITION;
"
echo ""
echo "5. Test INSERT of special characters (will be rolled back)"
echo "-----------------------------------------------------------"
echo "Attempting to insert test record with special characters..."
run_sql_utf8 "
START TRANSACTION;
INSERT INTO principles (name, code, address, city, state, postcode, country_id, currency_id, short_name, url)
VALUES (
'Test Müller & Söhne GmbH', -- German umlauts
999999,
'Straße 123, 2. Stock', -- German ß and ² symbol
'München', -- German ü
'Bayern',
'80331',
1,
1,
'TEST',
'http://test.com'
);
-- Read it back immediately
SELECT id, name, address, city
FROM principles
WHERE code = 999999;
ROLLBACK;
" 2>&1 | grep -A 10 "id\|Test Müller\|ERROR"
echo "✓ Test insert rolled back"
echo ""
echo "6. Verify current corrupted data in principles"
echo "-----------------------------------------------"
echo "These should show corruption if NOT yet fixed:"
run_sql_utf8 "
SELECT id, name, city
FROM principles
WHERE id IN (9, 18);
"
echo ""
echo "7. Verify corrupted data in customers"
echo "--------------------------------------"
run_sql_utf8 "
SELECT id, name
FROM customers
WHERE id IN (253, 1006, 1387, 1608);
"
echo ""
echo "8. Count potentially corrupted records across tables"
echo "-----------------------------------------------------"
run_sql "
SELECT
'principles' as table_name,
COUNT(*) as rows_with_non_ascii
FROM principles
WHERE name REGEXP '[^ -~]'
OR address REGEXP '[^ -~]'
OR city REGEXP '[^ -~]'
UNION ALL
SELECT
'customers',
COUNT(*)
FROM customers
WHERE name REGEXP '[^ -~]'
UNION ALL
SELECT
'addresses',
COUNT(*)
FROM addresses
WHERE address REGEXP '[^ -~]'
OR city REGEXP '[^ -~]'
UNION ALL
SELECT
'contacts',
COUNT(*)
FROM contacts
WHERE firstname REGEXP '[^ -~]'
OR lastname REGEXP '[^ -~]';
"
echo ""
echo "=========================================="
echo "Verification complete!"
echo "=========================================="
echo ""
echo "INTERPRETATION:"
echo "- If table_count for 'latin1' is ~59: NOT YET FIXED"
echo "- If table_count for 'utf8mb4' is ~62: FIXED!"
echo "- Test insert should show proper characters if fixed"
echo "- Corrupted data will still show mojibake until manually repaired"
echo ""

View file

@ -0,0 +1,207 @@
-- ============================================================================
-- Manual MariaDB Character Encoding Verification Commands
-- ============================================================================
-- Run these commands directly in MariaDB to verify the charset fix
--
-- Connect to database:
-- docker exec -it cmc-db mariadb -u root -psecureRootPassword --default-character-set=utf8mb4 cmc
-- ============================================================================
-- 1. CHECK DATABASE-LEVEL SETTINGS
-- Should show: utf8mb4
SELECT @@character_set_database as db_charset,
@@collation_database as db_collation;
-- 2. COUNT TABLES BY CHARACTER SET
-- BEFORE FIX: ~59 latin1 tables
-- AFTER FIX: ~62 utf8mb4 tables
SELECT
CASE
WHEN TABLE_COLLATION LIKE 'latin1%' THEN 'latin1'
WHEN TABLE_COLLATION LIKE 'utf8mb4%' THEN 'utf8mb4'
WHEN TABLE_COLLATION LIKE 'utf8mb3%' THEN 'utf8mb3'
ELSE TABLE_COLLATION
END as charset,
COUNT(*) as table_count
FROM information_schema.TABLES
WHERE TABLE_SCHEMA='cmc'
GROUP BY charset
ORDER BY table_count DESC;
-- 3. CHECK SPECIFIC TABLES (Key business tables)
SELECT TABLE_NAME, TABLE_COLLATION, ENGINE
FROM information_schema.TABLES
WHERE TABLE_SCHEMA='cmc'
AND TABLE_NAME IN (
'principles', 'customers', 'addresses', 'contacts',
'products', 'quotes', 'purchase_orders', 'invoices'
)
ORDER BY TABLE_NAME;
-- 4. CHECK ALL COLUMN CHARSETS FOR 'principles' TABLE
-- All text columns should show utf8mb4 after fix
SELECT
COLUMN_NAME,
DATA_TYPE,
CHARACTER_SET_NAME,
COLLATION_NAME
FROM information_schema.COLUMNS
WHERE TABLE_SCHEMA='cmc'
AND TABLE_NAME='principles'
AND DATA_TYPE IN ('varchar', 'text', 'char')
ORDER BY ORDINAL_POSITION;
-- 5. TEST INSERT WITH SPECIAL CHARACTERS
-- This will be rolled back - just testing if charset accepts them
START TRANSACTION;
-- Insert test record with various special characters
INSERT INTO principles (
name, code, address, city, state, postcode,
country_id, currency_id, short_name, url
)
VALUES (
'Test Müller & Söhne GmbH', -- German umlauts (ü, ö)
999999,
'Straße 123, 2. Stock', -- German ß
'München', -- German ü
'Bayern',
'80331',
1, 1, 'TEST', 'http://test.com'
);
-- Read it back - should show properly if charset is correct
SELECT id, name, address, city
FROM principles
WHERE code = 999999;
-- Expected output if FIXED:
-- name: Test Müller & Söhne GmbH (proper umlauts)
-- address: Straße 123, 2. Stock
-- city: München
-- Expected output if NOT FIXED:
-- Characters will be corrupted or insert will fail
ROLLBACK; -- Don't actually save the test record
-- 6. VIEW KNOWN CORRUPTED RECORDS
-- These show what the corruption looks like before manual repair
-- Principle ID 9 - Czech characters corrupted
SELECT id, name, address, city
FROM principles
WHERE id = 9;
-- BEFORE FIX: BezruÄova 2901, Rožnov pod RadhoÅ¡tÄm
-- AFTER FIX (still needs manual repair): Same corruption
-- AFTER MANUAL REPAIR: Bezručova 2901, Rožnov pod Radhoštěm
-- Principle ID 18 - German umlaut in city
SELECT id, short_name, city, country_id
FROM principles
WHERE id = 18;
-- Customer corrupted names
SELECT id, name
FROM customers
WHERE id IN (253, 1006, 1387, 1608, 2174);
-- Look for: ’ (should be '), É (should be É), “ (should be ")
-- Address corrupted data
SELECT id, address, city
FROM addresses
WHERE id IN (19, 25);
-- Look for: â€" (should be )
-- 7. COUNT RECORDS WITH NON-ASCII CHARACTERS
-- Shows how many records might need manual repair after charset fix
SELECT
'principles' as table_name,
COUNT(*) as records_with_non_ascii
FROM principles
WHERE name REGEXP '[^ -~]'
OR address REGEXP '[^ -~]'
OR city REGEXP '[^ -~]'
UNION ALL
SELECT 'customers', COUNT(*)
FROM customers
WHERE name REGEXP '[^ -~]'
UNION ALL
SELECT 'addresses', COUNT(*)
FROM addresses
WHERE address REGEXP '[^ -~]'
OR city REGEXP '[^ -~]'
UNION ALL
SELECT 'contacts', COUNT(*)
FROM contacts
WHERE firstname REGEXP '[^ -~]'
OR lastname REGEXP '[^ -~]';
-- 8. VERIFY CONNECTION CHARACTER SET
-- Your client should also be using utf8mb4
SHOW VARIABLES LIKE 'character_set_%';
SHOW VARIABLES LIKE 'collation_%';
-- Should see:
-- character_set_client: utf8mb4
-- character_set_connection: utf8mb4
-- character_set_results: utf8mb4
-- character_set_database: utf8mb4
-- ============================================================================
-- AFTER CONVERSION - TEST NEW INSERTS
-- ============================================================================
-- After running convert_charset_to_utf8mb4.sql, try these tests:
-- Test 1: Insert with German characters
START TRANSACTION;
INSERT INTO principles (name, code, address, city, state, postcode, country_id, currency_id, short_name, url)
VALUES ('Müller GmbH', 999991, 'Hauptstraße 1', 'München', 'BY', '80331', 1, 1, 'MUL', 'https://test.de');
SELECT * FROM principles WHERE code = 999991;
-- Should show: Müller, Hauptstraße, München (correctly)
ROLLBACK;
-- Test 2: Insert with French characters
START TRANSACTION;
INSERT INTO principles (name, code, address, city, state, postcode, country_id, currency_id, short_name, url)
VALUES ('Société François', 999992, 'Rue de la Paix', 'Orléans', 'FR', '45000', 1, 1, 'SOC', 'https://test.fr');
SELECT * FROM principles WHERE code = 999992;
-- Should show: Société, François, Orléans (correctly)
ROLLBACK;
-- Test 3: Insert with special punctuation
START TRANSACTION;
INSERT INTO customers (name, trading_name, abn, created, notes, discount_pricing_policies, payment_terms, customer_category_id, url, country_id)
VALUES ('O'Connor & Sons "Quality" Products', 'O'Connor Trading', '', NOW(), '', '', '', 1, '', 1);
SELECT name FROM customers WHERE name LIKE '%O''Connor%' ORDER BY id DESC LIMIT 1;
-- Should show: O'Connor & Sons — "Quality" Products (with proper apostrophes, em-dash, and quotes)
ROLLBACK;
-- ============================================================================
-- SUMMARY OF WHAT TO LOOK FOR
-- ============================================================================
/*
BEFORE FIX:
- Table charset: latin1_swedish_ci
- Column charset: latin1
- Test insert: May fail or corrupt characters
- Existing data: Shows mojibake (Ã, â, etc.)
AFTER FIX (charset conversion only):
- Table charset: utf8mb4_general_ci
- Column charset: utf8mb4
- Test insert: Succeeds with proper characters
- Existing data: STILL shows mojibake (needs manual repair)
AFTER MANUAL REPAIR:
- Everything displays correctly
- New inserts work properly
- Old data has been corrected
*/