Comic Character ETL Project

Performing extract, transform, and load techniques to create a database of comic book characters and their powers.


This project was done using MySQL and Python 3. The data was extracted from the ComicVine API and the FiveThirtyEight Comic Characters Dataset.

In [6]:
#Import dependencies
import pandas as pd
import requests
import time
from sqlalchemy import create_engine
In [8]:
#create database connection
rds_connection_string = "<Your Connection String Here>"
engine = create_engine(f'mysql+pymysql://{rds_connection_string}')

engine.table_names()
Out[8]:
['char_attributes', 'char_powers', 'characters', 'powers']
In [31]:
#view raw data for marvel comic character dataset
marvel_df = pd.read_csv("data/marvel-wikia-data.csv")
marvel_df.head()
Out[31]:
page_id name urlslug ID ALIGN EYE HAIR SEX GSM ALIVE APPEARANCES FIRST APPEARANCE Year
0 1678 Spider-Man (Peter Parker) \/Spider-Man_(Peter_Parker) Secret Identity Good Characters Hazel Eyes Brown Hair Male Characters NaN Living Characters 4043.0 Aug-62 1962.0
1 7139 Captain America (Steven Rogers) \/Captain_America_(Steven_Rogers) Public Identity Good Characters Blue Eyes White Hair Male Characters NaN Living Characters 3360.0 Mar-41 1941.0
2 64786 Wolverine (James \"Logan\" Howlett) \/Wolverine_(James_%22Logan%22_Howlett) Public Identity Neutral Characters Blue Eyes Black Hair Male Characters NaN Living Characters 3061.0 Oct-74 1974.0
3 1868 Iron Man (Anthony \"Tony\" Stark) \/Iron_Man_(Anthony_%22Tony%22_Stark) Public Identity Good Characters Blue Eyes Black Hair Male Characters NaN Living Characters 2961.0 Mar-63 1963.0
4 2460 Thor (Thor Odinson) \/Thor_(Thor_Odinson) No Dual Identity Good Characters Blue Eyes Blond Hair Male Characters NaN Living Characters 2258.0 Nov-50 1950.0
In [32]:
#view raw data for DC comic character dataset
dc_df = pd.read_csv('data/dc-wikia-data.csv')
dc_df.head()
Out[32]:
page_id name urlslug ID ALIGN EYE HAIR SEX GSM ALIVE APPEARANCES FIRST APPEARANCE YEAR
0 1422 Batman (Bruce Wayne) \/wiki\/Batman_(Bruce_Wayne) Secret Identity Good Characters Blue Eyes Black Hair Male Characters NaN Living Characters 3093.0 1939, May 1939.0
1 23387 Superman (Clark Kent) \/wiki\/Superman_(Clark_Kent) Secret Identity Good Characters Blue Eyes Black Hair Male Characters NaN Living Characters 2496.0 1986, October 1986.0
2 1458 Green Lantern (Hal Jordan) \/wiki\/Green_Lantern_(Hal_Jordan) Secret Identity Good Characters Brown Eyes Brown Hair Male Characters NaN Living Characters 1565.0 1959, October 1959.0
3 1659 James Gordon (New Earth) \/wiki\/James_Gordon_(New_Earth) Public Identity Good Characters Brown Eyes White Hair Male Characters NaN Living Characters 1316.0 1987, February 1987.0
4 1576 Richard Grayson (New Earth) \/wiki\/Richard_Grayson_(New_Earth) Secret Identity Good Characters Blue Eyes Black Hair Male Characters NaN Living Characters 1237.0 1940, April 1940.0

ETL

In [33]:
#add universe value
marvel_df['universe'] = 'Marvel'

#split the comic character name and the real identity name into different columns
marv_new = marvel_df['name'].str.split('(', n = 1, expand = True)
marvel_df['character_name'] = marv_new[0]
marvel_df['character_name'] = marvel_df['character_name'].str.rstrip()
marvel_df['real_name'] = marv_new[1]
marvel_df['real_name'] = marvel_df['real_name'].str.replace(r')', '')

#keep only the desired columns
marvel_character_df = marvel_df[['name', 'character_name', 'real_name', 'universe'
                                 ,'ALIGN', 'EYE', 'HAIR', 'SEX', 'Year']]
marvel_character_df.head()
Out[33]:
name character_name real_name universe ALIGN EYE HAIR SEX Year
0 Spider-Man (Peter Parker) Spider-Man Peter Parker Marvel Good Characters Hazel Eyes Brown Hair Male Characters 1962.0
1 Captain America (Steven Rogers) Captain America Steven Rogers Marvel Good Characters Blue Eyes White Hair Male Characters 1941.0
2 Wolverine (James \"Logan\" Howlett) Wolverine James \"Logan\" Howlett Marvel Neutral Characters Blue Eyes Black Hair Male Characters 1974.0
3 Iron Man (Anthony \"Tony\" Stark) Iron Man Anthony \"Tony\" Stark Marvel Good Characters Blue Eyes Black Hair Male Characters 1963.0
4 Thor (Thor Odinson) Thor Thor Odinson Marvel Good Characters Blue Eyes Blond Hair Male Characters 1950.0
In [34]:
#add universe value
dc_df['universe']='DC'

#rename the year column to match the marvel dataset
dc_df['Year'] = dc_df['YEAR']

#slipt the comic character name and the real identity name into different columns
dc_new = dc_df['name'].str.split('(', n = 1, expand = True)
dc_df['character_name'] = dc_new[0]
dc_df['character_name'] = dc_df['character_name'].str.rstrip()
dc_df['real_name'] = dc_new[1]
dc_df['real_name'] = dc_df['real_name'].str.replace(r')', '')

#keep only the desired columns
dc_character_df = dc_df[['name', 'character_name', 'real_name', 'universe'
                         ,'ALIGN', 'EYE', 'HAIR', 'SEX', 'Year']]
dc_character_df.head()
Out[34]:
name character_name real_name universe ALIGN EYE HAIR SEX Year
0 Batman (Bruce Wayne) Batman Bruce Wayne DC Good Characters Blue Eyes Black Hair Male Characters 1939.0
1 Superman (Clark Kent) Superman Clark Kent DC Good Characters Blue Eyes Black Hair Male Characters 1986.0
2 Green Lantern (Hal Jordan) Green Lantern Hal Jordan DC Good Characters Brown Eyes Brown Hair Male Characters 1959.0
3 James Gordon (New Earth) James Gordon New Earth DC Good Characters Brown Eyes White Hair Male Characters 1987.0
4 Richard Grayson (New Earth) Richard Grayson New Earth DC Good Characters Blue Eyes Black Hair Male Characters 1940.0
In [ ]:
 
In [35]:
#create a master characters dataframe by concatenating the two data sets
characters_df = pd.concat([marvel_character_df, dc_character_df], join="inner")
characters_df = characters_df.reset_index(drop = True)
characters_df = characters_df.reset_index()

#remove unnecessary words from the columns
characters_df['ALIGN'] = dc_df['ALIGN'].str.replace(r' Characters', '')
characters_df['SEX'] = dc_df['SEX'].str.replace(r' Characters', '')
characters_df['EYE'] = dc_df['EYE'].str.replace(r' Eyes', '')
characters_df['HAIR'] = dc_df['HAIR'].str.replace(r' Hair', '')


characters_df.head()
Out[35]:
index name character_name real_name universe ALIGN EYE HAIR SEX Year
0 0 Spider-Man (Peter Parker) Spider-Man Peter Parker Marvel Good Blue Black Male 1962.0
1 1 Captain America (Steven Rogers) Captain America Steven Rogers Marvel Good Blue Black Male 1941.0
2 2 Wolverine (James \"Logan\" Howlett) Wolverine James \"Logan\" Howlett Marvel Good Brown Brown Male 1974.0
3 3 Iron Man (Anthony \"Tony\" Stark) Iron Man Anthony \"Tony\" Stark Marvel Good Brown White Male 1963.0
4 4 Thor (Thor Odinson) Thor Thor Odinson Marvel Good Blue Black Male 1950.0
In [38]:
#create a copy of the characters data frame to adjust the index to act as the primary key when loaded into the character table
character_cols = ['index','name','character_name','real_name','universe'
                  ,'ALIGN','EYE','HAIR', 'SEX','Year']

characters_table = characters_df[character_cols].copy()
characters_table = characters_table.rename(columns = {'index' : 'character_id'})
characters_table['character_id'] = characters_table['character_id'] +1

#select the columns for to be inserted into the characters table
character_table = characters_table[['character_id','character_name', 'universe']]
character_table.head()
Out[38]:
character_id character_name universe
0 1 Spider-Man Marvel
1 2 Captain America Marvel
2 3 Wolverine Marvel
3 4 Iron Man Marvel
4 5 Thor Marvel
In [14]:
#insert into the characters table
character_table.to_sql(name = 'characters', con = engine, if_exists = 'append', index = False)
In [39]:
#create an attributes dataframe to manipulate the index value to act as the attribute_id
attributes_table = characters_df[character_cols].copy()
attributes_table = attributes_table.rename(columns = {'index' : 'attribute_id'})
attributes_table['attribute_id'] = attributes_table['attribute_id'] +1

#add the character_id as the foreign key
attributes_table['character_id'] = character_table['character_id'] 

#rename the columns to match the schema
attributes_table = attributes_table.rename(columns = {'ALIGN':'alignment','EYE':'eye_color'
                                                      , 'HAIR':'hair_color', 'SEX':'gender','Year':'first_appearance'})

#select the columns to insert into the database from the attributes dataframe
attribute_table = attributes_table[['attribute_id', 'real_name', 'character_id', 'alignment'
                                    ,'eye_color','hair_color','gender','first_appearance']]
attribute_table.head()
Out[39]:
attribute_id real_name character_id alignment eye_color hair_color gender first_apperance
0 1 Peter Parker 1 Good Blue Black Male 1962.0
1 2 Steven Rogers 2 Good Blue Black Male 1941.0
2 3 James \"Logan\" Howlett 3 Good Brown Brown Male 1974.0
3 4 Anthony \"Tony\" Stark 4 Good Brown White Male 1963.0
4 5 Thor Odinson 5 Good Blue Black Male 1950.0
In [40]:
#verify the data. Looks like a hair color was in the eye color column
attribute_table['eye_color'].unique()
Out[40]:
array(['Blue', 'Brown', 'Green', 'Purple', 'Black', 'White', 'Red',
       'Photocellular', 'Hazel', 'Amber', 'Yellow', nan, 'Grey', 'Pink',
       'Violet', 'Gold', 'Orange', 'Auburn Hair'], dtype=object)
In [44]:
attribute_table.loc[attribute_table['eye_color']=='Auburn Hair']
Out[44]:
attribute_id real_name character_id alignment eye_color hair_color gender first_apperance
872 873 Earth-616 873 Good Auburn Hair NaN Female 1983.0
1776 1777 Earth-616 1777 Neutral Auburn Hair NaN Male 1968.0
3600 3601 Earth-616 3601 Good Auburn Hair NaN Female 2007.0
3623 3624 Earth-616 3624 Bad Auburn Hair NaN Female 2010.0
3627 3628 Earth-616 3628 Bad Auburn Hair NaN Female 2011.0
4070 4071 Earth-616 4071 Neutral Auburn Hair NaN Female 1973.0
5346 5347 Earth-616 5347 NaN Auburn Hair NaN Female 1986.0
In [45]:
#correct the eye color
attribute_table.loc[attribute_table['eye_color']=='Auburn Hair', ['hair_color']] = 'Auburn'
attribute_table.loc[attribute_table['eye_color']=='Auburn Hair', ['eye_color']] = 'Unknown'
attribute_table['eye_color'].unique()
c:\conda\envs\finance3\lib\site-packages\pandas\core\indexing.py:543: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
Out[45]:
array(['Blue', 'Brown', 'Green', 'Purple', 'Black', 'White', 'Red',
       'Photocellular', 'Hazel', 'Amber', 'Yellow', nan, 'Grey', 'Pink',
       'Violet', 'Gold', 'Orange', 'Unknown'], dtype=object)
In [41]:
#verify the data. Looks good
attribute_table['hair_color'].unique()
Out[41]:
array(['Black', 'Brown', 'White', 'Blond', 'Red', nan, 'Green',
       'Strawberry Blond', 'Grey', 'Silver', 'Orange', 'Purple', 'Gold',
       'Blue', 'Reddish Brown', 'Pink', 'Violet', 'Platinum Blond'],
      dtype=object)
In [46]:
#insert the data into the char_attributes table
attribute_table.to_sql(name = 'char_attributes', con = engine, if_exists = 'append', index = False)
In [48]:
#connect to the API and return json

base_url = 'https://comicvine.gamespot.com/api/'
resource = 'powers/'
key = '<Your API KEY here>'
url_format = '&sort=name&format=json'

api_call = base_url + resource + key + url_format

user_agent = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

response = requests.get(api_call, headers = user_agent).json()
In [49]:
#verify the output
api_df = pd.DataFrame(response['results'])
api_df.head()
Out[49]:
aliases api_detail_url date_added date_last_updated description id name site_detail_url
0 None https://comicvine.gamespot.com/api/power/4035-66/ 2008-06-06 11:28:15 2008-06-06 11:28:15 None 66 Adaptive https://comicvine.gamespot.com/characters/?wik...
1 None https://comicvine.gamespot.com/api/power/4035-4/ 2008-06-06 11:28:15 2008-06-06 11:28:15 <p>Agility is the ability to move nimbly or qu... 4 Agility https://comicvine.gamespot.com/characters/?wik...
2 None https://comicvine.gamespot.com/api/power/4035-87/ 2008-06-06 11:28:15 2008-06-06 11:28:15 None 87 Animal Control https://comicvine.gamespot.com/characters/?wik...
3 None https://comicvine.gamespot.com/api/power/4035-68/ 2008-06-06 11:28:15 2008-06-06 11:28:15 None 68 Animation https://comicvine.gamespot.com/characters/?wik...
4 None https://comicvine.gamespot.com/api/power/4035-60/ 2008-06-06 11:28:15 2008-06-06 11:28:15 None 60 Astral Projection https://comicvine.gamespot.com/characters/?wik...
In [ ]:

In [50]:
#Use for loops to pull the powers and characters from the API call
power =[]
char = []
k = 0
for url in list(api_df['api_detail_url']):
    ul = api_df['api_detail_url'][k]
    ul = ul + key + url_format
    rep = requests.get(ul, headers = user_agent).json()
    k+=1
    j=0
    for r in rep['results']['characters']:
        try:
            chars = rep['results']['characters'][j]['name']
            char.append(chars)
            powers = rep['results']['name']
            power.append(powers)
            j+=1
        except(IndexError):
            print('done')
In [51]:
#load powers to a dataframe
powers_df= pd.DataFrame(power, columns = ['power'])
powers_df.head()
Out[51]:
power
0 Adaptive
1 Adaptive
2 Adaptive
3 Adaptive
4 Adaptive
In [52]:
powers_df['power'].unique()
Out[52]:
array(['Adaptive', 'Agility', 'Animal Control', 'Animation',
       'Astral Projection', 'Berserker Strength', 'Blast Power',
       'Blood Control', 'Chameleon', 'Chemical Absorbtion',
       'Chemical Secretion', 'Claws', 'Controlled Bone Growth',
       'Cosmic Awareness', 'Danger Sense', 'Darkforce Manipulation',
       'Darkness Manipulation', 'Death Touch', 'Density Control',
       'Dimensional Manipulation', 'Divine Powers', 'Duplication',
       'Earth Manipulation', 'Elasticity', 'Electricity Control',
       'Electronic Disruption', 'Electronic interaction',
       'Emotion Control', 'Empathy', 'Energy Absorption',
       'Energy Based Constructs', 'Energy Manipulation', 'Energy Shield',
       'Energy-Enhanced Strike', 'Enhance Mutation', 'Escape Artist',
       'Feral', 'Fire Control', 'Flame Breath', 'Flight', 'Force Field',
       'Gadgets', 'Genetic Manipulation', 'Gravity control', 'Healing',
       'Heat Generation', 'Heat Vision', 'Hellfire Control',
       'Holographic Projection', 'Hypnosis', 'Ice Breath', 'Ice Control',
       'Illusion Casting', 'Immortal', 'Implants', 'Inertia Absorption',
       'Insanely Rich', 'Intellect', 'Invisibility', 'Invulnerability',
       'Leadership', 'Levitation', 'Light Projection', 'Longevity',
       'Magic', 'Magnetism', 'Marksmanship', 'Matter Absorption',
       'Mesmerize', 'Necromancy', 'Omni-lingual', 'Penance Stare',
       'Phasing / Ghost', 'Pheromone Control', 'Plant Control',
       'Poisonous', 'Possession', 'Postcognition', 'Power Item',
       'Power Mimicry', 'Power Suit', 'Precognition', 'Prehensile Hair',
       'Probability Manipulation', 'Psionic', 'Psychic', 'Psychometry',
       'Radar Sense', 'Radiation', 'Reality Manpulation',
       'Sand manipulation', 'Sense Death', 'Shadowmeld', 'Shape Shifter',
       'Siphon Abilities', 'Siphon Lifeforce', 'Size Manipulation',
       'Sonic Scream', 'Soul Absorption', 'Stamina'], dtype=object)
In [59]:
save_path = ''

data = powers_df
data['characters'] = char

data.head()
Out[59]:
power characters
0 Adaptive Brainiac 5
1 Adaptive Jason
2 Adaptive Man-Witch
3 Adaptive Bravestarr
4 Adaptive Captain America
In [62]:
#Save the powers data to csv
#data.to_csv(save_path)
#close python connection to the file
In [53]:
#copy powers df to create powers table for loading
powers_table = powers_df.copy()
#drop duplicates
powers_table.drop_duplicates(inplace = True)
powers_table = powers_table.reset_index(drop = True)
powers_table = powers_table.reset_index()
powers_table = powers_table.rename(columns = {'index' : 'power_id'})
powers_table.head()
Out[53]:
power_id power
0 0 Adaptive
1 1 Agility
2 2 Animal Control
3 3 Animation
4 4 Astral Projection
In [54]:
#set the primary key value
powers_table['power_id'] = powers_table['power_id'] + 1
In [55]:
#load the powers to sql table
powers_table.to_sql(name = 'powers', con = engine, if_exists = 'append', index = False)
In [ ]:

In [56]:
char_powers = powers_df.copy()
char_powers['character_name'] = char
char_powers = char_powers.reset_index(drop = True)

#merge the character list with the powers table
char_powers = char_powers.merge(powers_table, on = 'power', how = 'inner')

#remove duplicate character names from the character_table dataframe 
#since it isn't possible to tell which character matches with the API data
characters_no_duplicates = character_table.drop_duplicates(subset = ['character_name'], keep = False)

#merge the data frames
char_powers = char_powers.merge(characters_no_duplicates, on = 'character_name', how = 'inner')
char_powers = char_powers.sort_values('character_id')

#remove duplicates but keep the first instance of value
char_powers = char_powers.drop_duplicates(keep ='first')
char_powers.head()
Out[56]:
power character_name power_id character_id universe
5304 Healing Spider-Man 45 1 Marvel
5308 Stamina Spider-Man 100 1 Marvel
5307 Radar Sense Spider-Man 88 1 Marvel
5306 Power Suit Spider-Man 81 1 Marvel
5305 Intellect Spider-Man 58 1 Marvel
In [57]:
#select the key rows to be inserted into the database
char_powers_table = char_powers[['power_id', 'character_id']]
char_powers_table.head()
Out[57]:
power_id character_id
5304 45 1
5308 100 1
5307 88 1
5306 81 1
5305 58 1
In [58]:
#populate the SQL table with character and power combination
char_powers_table.to_sql(name = 'char_powers', con = engine, if_exists = 'append', index = False)