= 'exam_db' DB_NAME
SQL Solution
Configuring SQL
Creating a new db instance and loading tables
Creating DB
Using mysql
library to connect as root user and create a new db instance
try:
= mysql.connector.connect(user='root', host='localhost')
connection = connection.cursor()
cursor f"CREATE DATABASE IF NOT EXISTS {DB_NAME};")
cursor.execute(except Error as e:
print(f"Error: {e}")
finally:
if connection.is_connected():
cursor.close() connection.close()
Since that was a lot of code just to execute 1 line of SQL, I’m going to create a new function that will make it easier
= os.getenv("DB_USER")
DB_USER = os.getenv("DB_PASSWORD") DB_PASSWORD
insert_sql
insert_sql (sql, hide_output=False)
Inserts data into the MySQL database.
insert_proc
insert_proc (sql, proc_name, call=True, hide_output=False)
Create a stored procedure and call it
Loading Datasets Into Database
Using prefix ‘temp_’
Mapping DataFrames to table names
= {'temp_contact_methods': contact_methods, 'temp_contacts': contacts, 'temp_gifts': gifts} tables
= create_engine(f'mysql+mysqlconnector://{DB_USER}:{DB_PASSWORD}@localhost/{DB_NAME}') engine
try:
for table_name, df in tables.items():
try:
=False, if_exists='fail')
df.to_sql(table_name, engine, indexexcept ValueError:
print(f"Table {table_name} already exists. Skipping.")
except Error as e:
print(f"Error: {e}")
finally:
if connection.is_connected():
cursor.close() connection.close()
Table temp_contact_methods already exists. Skipping.
Table temp_contacts already exists. Skipping.
Table temp_gifts already exists. Skipping.
Previewing the data
'select * from temp_gifts limit 5', engine) pd.read_sql(
DonorNumber | LegacyGiftId | FirstName | LastName | AmountReceived | CreditCardType | PaymentMethod | LegacyPledgeID | Notes | Project1Code | Project2Code | GiftDate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 848348568-0 | 95196378 | Mannie | Turpin | 4.1500 | PayPal | 0 | None | 2019-03-04 | |||
1 | 729707142-0 | 95196889 | Cymbre | Cross | 2.3648 | check | 1 | ChildSponsorship | None | 2019-03-05 | ||
2 | 687119652-8 | 95197689 | Ruggiero | Makepeace | 1.3100 | cash | 2 | None | 2019-03-07 | |||
3 | 653377813-7 | 95198998 | Karita | Lumbers | 2.0400 | American Ex | credit card | 3 | In honor of Mannie Turpin | None | 2019-03-10 | |
4 | 390551098-7 | 95198999 | Helga | Benech | 5.8000 | cash | 89752384 | None | 2019-01-10 |
Solution
Creating procedures for each modification
ContactType
is required and can only be Household or Organization
Source Table:
- Contacts Table
Solution:
1. Create procedure to add new column ContactType
I’m going to write a procedure that would perform this transformation on the temp table.
This process is going to require me to do 2 things:
1. Add a column to the table
2. Insert data into that column
Adding a column to a table can be a lengthy process since MySQL
doesn’t provide a IF NOT EXISTS
option for adding a column.
Asa result I’m going to write 2 procedures to solve this problem:
1. add_column
procedure
2. insert_contact_type
procedure
= """
add_column DROP PROCEDURE IF EXISTS add_column;
CREATE PROCEDURE add_column(
IN tableName VARCHAR(255),
IN columnName VARCHAR(255),
IN columnType VARCHAR(255)
)
BEGIN
DECLARE columnExists BOOLEAN DEFAULT FALSE;
SELECT COUNT(*)
INTO columnExists
FROM information_schema.COLUMNS
WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = tableName AND COLUMN_NAME = columnName;
IF columnExists = 0 THEN
SET @sql = CONCAT('ALTER TABLE ', tableName, ' ADD COLUMN ', columnName, ' ', columnType);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
END IF;
COMMIT;
END;
"""
insert_sql(add_column)
MySQL connection closed
= """
insert_contact_type CALL add_column('temp_contacts', 'ContactType', 'VARCHAR(255)');
UPDATE temp_contacts
SET ContactType = CASE WHEN CompanyName = '' THEN 'Household' ELSE 'Organization' END;
COMMIT;
"""
'insert_contact_type', call=True) insert_proc(insert_contact_type,
MySQL connection closed
MySQL connection closed
MySQL connection closed
Private
Does someone want to be private
Source Table:
- Contacts Table
Solution:
1. Create procedure to add new column Private
= """
insert_private CALL add_column('temp_contacts', 'Private', 'TINYINT');
UPDATE temp_contacts
SET Private = CASE WHEN Remarks = 'Is anonymous' THEN 1 ELSE 0 END;
commit;
"""
'insert_private', call=True) insert_proc(insert_private,
MySQL connection closed
MySQL connection closed
MySQL connection closed
Postal Code
if address is present and is US, must be a valid zip code, either 12345 or 12345-1234
Source Table:
- Contacts
Solution:
1. Create procedure to remove any postal codees that doesn’t match the approved format from the usps
= f"""
contacts_view CREATE OR REPLACE VIEW contacts_view AS
SELECT
NUMBER,
POSTAL AS OLD_POSTAL,
CASE
WHEN REGEXP_LIKE(Postal, '^[0-9]{{5}}$') OR REGEXP_LIKE(Postal, '^[0-9]{{5}}-[0-9]{{4}}$') THEN Postal
ELSE ''
END AS NEW_POSTAL
FROM temp_contacts;
"""
insert_sql(contacts_view)"SELECT * FROM contacts_view where old_postal != '' limit 5", con=engine) pd.read_sql(
MySQL connection closed
NUMBER | OLD_POSTAL | NEW_POSTAL | |
---|---|---|---|
0 | 653377813-7 | 20535-871 | |
1 | 390551098-7 | 89130 | 89130 |
2 | 729707142-0 | 8104 | |
3 | 488464926-5 | 49560 | 49560 |
4 | 315297729-8 | 837016 |
Success!
Creating Stored Procedure…
= """
update_zip UPDATE temp_contacts
SET Postal = ''
WHERE Postal NOT REGEXP '^[0-9]{5}$' AND Postal NOT REGEXP '^[0-9]{5}-[0-9]{4}$';
COMMIT;
"""
'update_zip', call=True) insert_proc(update_zip,
MySQL connection closed
MySQL connection closed
MySQL connection closed
IsDeceased
can only be TRUE or FALSE
Source Table:
- Contacts
Solution:
1. Create procedure to update Deceased to TRUE/FALSE
= f"""
update_deceased UPDATE temp_contacts
SET Deceased = CASE
WHEN Deceased = 'Yes' THEN 1
ELSE 0
END;
commit;
ALTER TABLE temp_contacts MODIFY Deceased TINYINT;
commit;
"""
'update_deceased', call=True) insert_proc(update_deceased,
MySQL connection closed
MySQL connection closed
Error: 1292 (22007): Truncated incorrect DOUBLE value: 'Yes'
MySQL connection closed
GiftType
Can only be Cash, Check, Credit, Other, or Reversing Transaction
Source Table:
- Gifts
Solution:
1. Identify Incorrect Gift Types
2. Create procedure to replace invalid gift types
'Select Distinct PaymentMethod from temp_gifts', engine) pd.read_sql(
PaymentMethod | |
---|---|
0 | PayPal |
1 | check |
2 | cash |
3 | credit card |
4 | |
5 | money order |
It looks like there are several payment methods that don’t match the approved list. Additionally, the payment method ‘credit card’ will need to be mapped to ‘credit’
= f"""
update_gift_type UPDATE temp_gifts
SET PaymentMethod = CASE
WHEN AmountReceived < 0 THEN 'Reversing Transaction'
WHEN LOWER(TRIM(PaymentMethod)) = 'cash' THEN 'Cash'
WHEN LOWER(TRIM(PaymentMethod)) = 'check' THEN 'Check'
WHEN LOWER(TRIM(PaymentMethod)) LIKE 'credit%' THEN 'Credit'
ELSE 'Other'
END;
commit;
"""
'update_gift_type', call=True) insert_proc(update_gift_type,
MySQL connection closed
MySQL connection closed
MySQL connection closed
CreditCardType
Can only be Visa, Mastercard, AMEX, Discover
Solution:
1. Identify Incorrect Credit Types
2. Create procedure to replace invalid credit types
'select distinct CreditCardType from temp_gifts', engine) pd.read_sql(
CreditCardType | |
---|---|
0 | |
1 | American Ex |
2 | AMEX |
3 | Visa |
4 | Master card |
5 | Mastercard |
6 | Discover |
= f"""
proc UPDATE temp_gifts
SET CreditCardType = CASE
WHEN CreditCardType = 'Master card' THEN 'Mastercard'
else 'AMEX'
end
WHERE CreditCardType IN ('American Ex', 'Master car');
commit;
"""
'update_gift_type', call=True) insert_proc(proc,
MySQL connection closed
MySQL connection closed
MySQL connection closed
Execution
Creating Final Tables
[contact_methods, contacts, gifts]
Contacts
Creating the final contacts table
- The cleaning is done
- Creating contacts table and renaming various columns
Creating procedure, selecting 5 rows from the contacts table
Code
= """
proc
CREATE TABLE IF NOT EXISTS contacts (
`LegacyContactId` VARCHAR(255),
`LegacyIndividualId` VARCHAR(255),
`ContactType` VARCHAR(255),
`ContactName` VARCHAR(255),
`FirstName` VARCHAR(255),
`LastName` VARCHAR(255),
`SecondaryLegacyIndividualId` VARCHAR(255),
`SecondaryFirstName` VARCHAR(255),
`SecondaryLastName` VARCHAR(255),
`HomePhone` VARCHAR(255),
`HomeEmail` VARCHAR(255),
`Address1` VARCHAR(255),
`City` VARCHAR(255),
`State` VARCHAR(255),
`PostalCode` VARCHAR(255),
`IsPrivate` VARCHAR(255),
`IsDeceased` VARCHAR(255)
);
insert into contacts
SELECT
`Number` AS `LegacyContactId`,
`LegacyIndividualId`,
`ContactType`,
`ContactName`,
`FirstName`,
`LastName`,
`SecondaryLegacyIndividualId`,
`SecondaryFirstName`,
`SecondaryLastName`,
CASE WHEN
temp_contact_methods.`Phone` = ''
THEN temp_contacts.`Phone`
ELSE temp_contact_methods.`Phone`
END AS HomePhone,
CASE WHEN
temp_contact_methods.`EMail` = ''
THEN temp_contacts.`EMail`
ELSE temp_contact_methods.`EMail`
END AS HomeEmail,
`Street` AS `Address1`,
`City`,
`State`,
`Postal` AS `PostalCode`,
`Private` AS `IsPrivate`,
`Deceased` AS `IsDeceased`
FROM
temp_contacts
LEFT JOIN
temp_contact_methods ON temp_contact_methods.DonorNumber = temp_contacts.`Number`;
commit;
"""
'create_contacts', call=True, hide_output=True)
insert_proc(proc, 'select * from contacts limit 5', engine) pd.read_sql(
LegacyContactId | LegacyIndividualId | ContactType | ContactName | FirstName | LastName | SecondaryLegacyIndividualId | SecondaryFirstName | SecondaryLastName | HomePhone | HomeEmail | Address1 | City | State | PostalCode | IsPrivate | IsDeceased | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 653377813-7 | 0 | Household | Karita & Kelvin Lumbers | Karita | Lumbers | 1 | Kelvin | Lumbers | 832-442-4988 | kklumbers@yahoo.com | 4 Bunting Parkway | Washington | DC | 1 | 0 | |
1 | 390551098-7 | 2 | Household | Helga Benech | Helga | Benech | ebenech1@goodreads.com | 48684 Jenifer Way | Las Vegas | NV | 89130 | 0 | 0 | ||||
2 | 093004505-X | 3 | Household | Masha Butt Gow | Masha | Butt Gow | 818-323-9865 | 353 Schmedeman Park | Indianapolis | IN | 0 | 0 | |||||
3 | 729707142-0 | 4 | Organization | Cymbre Cross | Cymbre | Cross | 2055 Lakewood Parkway | Camden | NJ | 0 | 0 | ||||||
4 | 488464926-5 | 5 | Household | Hoyt Castille | Hoyt | Castille | fcastille4@timesonline.co.uk | 37 8th Trail | Grand Rapids | MI | 49560 | 0 | 0 |
Gifts
Creating the final gifts column
- The cleaning is done
- Creating the gifts table and renaming some columns
Code
= """
proc CREATE TABLE IF NOT EXISTS gifts (
LegacyContactId VARCHAR(255),
LegacyGiftId INTEGER,
GiftType TEXT,
GiftDate TEXT,
GiftAmount REAL,
Notes TEXT,
CreditCardType TEXT,
Project1Code TEXT,
Project2Code TEXT,
LegacyPledgeID INTEGER
);
insert into gifts
select
DonorNumber as LegacyContactId,
LegacyGiftId,
PaymentMethod as GiftType,
GiftDate,
AmountReceived as GiftAmount,
Notes,
CreditCardType,
Project1Code,
Project2Code,
LegacyPledgeID
from temp_gifts;
commit;
"""
'create_gifts', call=True, hide_output=True)
insert_proc(proc, 'select * from gifts limit 5', engine) pd.read_sql(
LegacyContactId | LegacyGiftId | GiftType | GiftDate | GiftAmount | Notes | CreditCardType | Project1Code | Project2Code | LegacyPledgeID | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 848348568-0 | 95196378 | Other | 2019-03-04 00:00:00 | 4.1500 | None | 0 | |||
1 | 729707142-0 | 95196889 | Check | 2019-03-05 00:00:00 | 2.3648 | ChildSponsorship | None | 1 | ||
2 | 687119652-8 | 95197689 | Cash | 2019-03-07 00:00:00 | 1.3100 | None | 2 | |||
3 | 653377813-7 | 95198998 | Credit | 2019-03-10 00:00:00 | 2.0400 | In honor of Mannie Turpin | AMEX | None | 3 | |
4 | 390551098-7 | 95198999 | Cash | 2019-01-10 00:00:00 | 5.8000 | None | 89752384 |
Contact Methods
Procedure to create contact_methods table
- Creating a contact_methods
- Inserting distinct values from the contacts table
Code
= """
proc CREATE TABLE IF NOT EXISTS contact_methods (
`LegacyContactId` VARCHAR(255),
`Type` VARCHAR(255),
`Value` VARCHAR(255)
);
BEGIN
DECLARE done INT DEFAULT FALSE;
DECLARE v_LegacyContactId VARCHAR(255);
DECLARE v_HomePhone VARCHAR(255);
DECLARE v_HomeEmail VARCHAR(255);
DECLARE v_Fax VARCHAR(255);
DECLARE cur CURSOR FOR
SELECT DISTINCT
contacts.`LegacyContactId` AS LegacyContactId,
contacts.`HomePhone` AS HomePhone,
contacts.`HomeEmail` AS HomeEmail,
temp_contact_methods.Fax AS fax
FROM
contacts
JOIN
temp_contact_methods ON temp_contact_methods.DonorNumber = contacts.`LegacyContactId`;
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = 1;
OPEN cur;
read_loop: LOOP
FETCH cur INTO v_LegacyContactId, v_HomePhone, v_HomeEmail,v_Fax;
IF done THEN
LEAVE read_loop;
END IF;
INSERT INTO contact_methods (`LegacyContactId`, `Type`, `Value`) VALUES
(v_LegacyContactId, 'HomePhone', v_HomePhone),
(v_LegacyContactId, 'HomeEmail', v_HomeEmail),
(v_LegacyContactId, 'Fax', v_Fax);
END LOOP;
CLOSE cur;
END;
-- Delete records from contact methods where value is null or ''
DELETE FROM contact_methods WHERE `Value` IS NULL OR `Value` = '';
commit;
"""
'transform_contact_methods', call=True, hide_output=True)
insert_proc(proc, 'select * from contact_methods order by LegacyContactId', con=engine) pd.read_sql(
LegacyContactId | Type | Value | |
---|---|---|---|
0 | 029456846-8 | HomeEmail | jdoley6@telegraph.co.uk |
1 | 093004505-X | HomePhone | 818-323-9865 |
2 | 093004505-X | Fax | 818-156-7985 |
3 | 315297729-8 | HomeEmail | dmouncey9@cnn.com |
4 | 390551098-7 | HomeEmail | ebenech1@goodreads.com |
5 | 488464926-5 | HomeEmail | fcastille4@timesonline.co.uk |
6 | 653377813-7 | HomePhone | 832-442-4988 |
7 | 653377813-7 | HomeEmail | kklumbers@yahoo.com |
8 | 687119652-8 | HomeEmail | cmakepeace7@1688.com |
9 | 739131380-7 | Fax | 626-981-3874 |
10 | 848348568-0 | HomePhone | 702-844-9524 |
Export
import nbdev
'01_SQL_Solution.ipynb') nbdev.nbdev_export(