Performing extract, transform, and load techniques to create a database of comic book characters and their powers.
This project was done using MySQL and Python 3. The data was extracted from the ComicVine API and the FiveThirtyEight Comic Characters Dataset.
#Import dependencies
import pandas as pd
import requests
import time
from sqlalchemy import create_engine
#create database connection
rds_connection_string = "<Your Connection String Here>"
engine = create_engine(f'mysql+pymysql://{rds_connection_string}')
engine.table_names()
#view raw data for marvel comic character dataset
marvel_df = pd.read_csv("data/marvel-wikia-data.csv")
marvel_df.head()
#view raw data for DC comic character dataset
dc_df = pd.read_csv('data/dc-wikia-data.csv')
dc_df.head()
#add universe value
marvel_df['universe'] = 'Marvel'
#split the comic character name and the real identity name into different columns
marv_new = marvel_df['name'].str.split('(', n = 1, expand = True)
marvel_df['character_name'] = marv_new[0]
marvel_df['character_name'] = marvel_df['character_name'].str.rstrip()
marvel_df['real_name'] = marv_new[1]
marvel_df['real_name'] = marvel_df['real_name'].str.replace(r')', '')
#keep only the desired columns
marvel_character_df = marvel_df[['name', 'character_name', 'real_name', 'universe'
,'ALIGN', 'EYE', 'HAIR', 'SEX', 'Year']]
marvel_character_df.head()
#add universe value
dc_df['universe']='DC'
#rename the year column to match the marvel dataset
dc_df['Year'] = dc_df['YEAR']
#slipt the comic character name and the real identity name into different columns
dc_new = dc_df['name'].str.split('(', n = 1, expand = True)
dc_df['character_name'] = dc_new[0]
dc_df['character_name'] = dc_df['character_name'].str.rstrip()
dc_df['real_name'] = dc_new[1]
dc_df['real_name'] = dc_df['real_name'].str.replace(r')', '')
#keep only the desired columns
dc_character_df = dc_df[['name', 'character_name', 'real_name', 'universe'
,'ALIGN', 'EYE', 'HAIR', 'SEX', 'Year']]
dc_character_df.head()
#create a master characters dataframe by concatenating the two data sets
characters_df = pd.concat([marvel_character_df, dc_character_df], join="inner")
characters_df = characters_df.reset_index(drop = True)
characters_df = characters_df.reset_index()
#remove unnecessary words from the columns
characters_df['ALIGN'] = dc_df['ALIGN'].str.replace(r' Characters', '')
characters_df['SEX'] = dc_df['SEX'].str.replace(r' Characters', '')
characters_df['EYE'] = dc_df['EYE'].str.replace(r' Eyes', '')
characters_df['HAIR'] = dc_df['HAIR'].str.replace(r' Hair', '')
characters_df.head()
#create a copy of the characters data frame to adjust the index to act as the primary key when loaded into the character table
character_cols = ['index','name','character_name','real_name','universe'
,'ALIGN','EYE','HAIR', 'SEX','Year']
characters_table = characters_df[character_cols].copy()
characters_table = characters_table.rename(columns = {'index' : 'character_id'})
characters_table['character_id'] = characters_table['character_id'] +1
#select the columns for to be inserted into the characters table
character_table = characters_table[['character_id','character_name', 'universe']]
character_table.head()
#insert into the characters table
character_table.to_sql(name = 'characters', con = engine, if_exists = 'append', index = False)
#create an attributes dataframe to manipulate the index value to act as the attribute_id
attributes_table = characters_df[character_cols].copy()
attributes_table = attributes_table.rename(columns = {'index' : 'attribute_id'})
attributes_table['attribute_id'] = attributes_table['attribute_id'] +1
#add the character_id as the foreign key
attributes_table['character_id'] = character_table['character_id']
#rename the columns to match the schema
attributes_table = attributes_table.rename(columns = {'ALIGN':'alignment','EYE':'eye_color'
, 'HAIR':'hair_color', 'SEX':'gender','Year':'first_appearance'})
#select the columns to insert into the database from the attributes dataframe
attribute_table = attributes_table[['attribute_id', 'real_name', 'character_id', 'alignment'
,'eye_color','hair_color','gender','first_appearance']]
attribute_table.head()
#verify the data. Looks like a hair color was in the eye color column
attribute_table['eye_color'].unique()
attribute_table.loc[attribute_table['eye_color']=='Auburn Hair']
#correct the eye color
attribute_table.loc[attribute_table['eye_color']=='Auburn Hair', ['hair_color']] = 'Auburn'
attribute_table.loc[attribute_table['eye_color']=='Auburn Hair', ['eye_color']] = 'Unknown'
attribute_table['eye_color'].unique()
#verify the data. Looks good
attribute_table['hair_color'].unique()
#insert the data into the char_attributes table
attribute_table.to_sql(name = 'char_attributes', con = engine, if_exists = 'append', index = False)
#connect to the API and return json
base_url = 'https://comicvine.gamespot.com/api/'
resource = 'powers/'
key = '<Your API KEY here>'
url_format = '&sort=name&format=json'
api_call = base_url + resource + key + url_format
user_agent = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = requests.get(api_call, headers = user_agent).json()
#verify the output
api_df = pd.DataFrame(response['results'])
api_df.head()
#Use for loops to pull the powers and characters from the API call
power =[]
char = []
k = 0
for url in list(api_df['api_detail_url']):
ul = api_df['api_detail_url'][k]
ul = ul + key + url_format
rep = requests.get(ul, headers = user_agent).json()
k+=1
j=0
for r in rep['results']['characters']:
try:
chars = rep['results']['characters'][j]['name']
char.append(chars)
powers = rep['results']['name']
power.append(powers)
j+=1
except(IndexError):
print('done')
#load powers to a dataframe
powers_df= pd.DataFrame(power, columns = ['power'])
powers_df.head()
powers_df['power'].unique()
save_path = ''
data = powers_df
data['characters'] = char
data.head()
#Save the powers data to csv
#data.to_csv(save_path)
#close python connection to the file
#copy powers df to create powers table for loading
powers_table = powers_df.copy()
#drop duplicates
powers_table.drop_duplicates(inplace = True)
powers_table = powers_table.reset_index(drop = True)
powers_table = powers_table.reset_index()
powers_table = powers_table.rename(columns = {'index' : 'power_id'})
powers_table.head()
#set the primary key value
powers_table['power_id'] = powers_table['power_id'] + 1
#load the powers to sql table
powers_table.to_sql(name = 'powers', con = engine, if_exists = 'append', index = False)
char_powers = powers_df.copy()
char_powers['character_name'] = char
char_powers = char_powers.reset_index(drop = True)
#merge the character list with the powers table
char_powers = char_powers.merge(powers_table, on = 'power', how = 'inner')
#remove duplicate character names from the character_table dataframe
#since it isn't possible to tell which character matches with the API data
characters_no_duplicates = character_table.drop_duplicates(subset = ['character_name'], keep = False)
#merge the data frames
char_powers = char_powers.merge(characters_no_duplicates, on = 'character_name', how = 'inner')
char_powers = char_powers.sort_values('character_id')
#remove duplicates but keep the first instance of value
char_powers = char_powers.drop_duplicates(keep ='first')
char_powers.head()
#select the key rows to be inserted into the database
char_powers_table = char_powers[['power_id', 'character_id']]
char_powers_table.head()
#populate the SQL table with character and power combination
char_powers_table.to_sql(name = 'char_powers', con = engine, if_exists = 'append', index = False)