Data Ingestion with pandas

1. Importing Data from Flat Files and Spreadsheets

read_csv for all flat files
import pandas as pd
data = pd.read_csv('file.csv')
data = pd.read_csv('file.tsv', sep='\t')

#choose columns to load by name
col_names = ['STATEFIPS', 'STATE', 'zipcode', 'agi_stub', 'N1']
data = pd.read_csv('file.csv', usecols=col_names)

#choose columns to load by number
col_nums = [0, 1, 2, 3, 4]
data = pd.read_csv('file.csv', usecols=col_nums)

#limit the rows loaded, skip initial rows, handle a file with no header
data = pd.read_csv('file.csv', nrows=1000, skiprows=1000, header=None)

#assigning column names
colnames = list(another_dataframe)
data = pd.read_csv('file.csv', header=None, names=colnames)

#specifying data types
data = pd.read_csv('file.csv', dtype={'zipcode': str})

#customizing missing data values (0 in the zipcode column becomes NaN in the example)
data = pd.read_csv('file.csv', na_values={'zipcode' : 0})
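
na_values also accepts a plain list applied to every column, not just a per-column dict:

#treat 0 and the string 'NA' as missing in every column
data = pd.read_csv('file.csv', na_values=[0, 'NA'])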

#lines with errors
#error_bad_lines=False skips lines that raise parsing errors
#warn_bad_lines=True prints a warning for each skipped line
data = pd.read_csv('file.csv', error_bad_lines=False, warn_bad_lines=True)
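
These two flags were deprecated in pandas 1.3 and removed in 2.0; in recent versions the single on_bad_lines parameter replaces them:

#equivalent in pandas >= 1.3: warn about bad lines and skip them
data = pd.read_csv('file.csv', on_bad_lines='warn')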

#read_excel for excel files
data = pd.read_excel('file.xlsx')   # reads only the first sheet

#many parameters in common with read_csv
#nrows
#skiprows
#usecols also accepts spreadsheet column letters as a string
#in this example only columns W through AB plus AR are loaded
data = pd.read_excel('file.xlsx', usecols="W:AB,AR")

#two ways to load the second sheet, named sheet2
data = pd.read_excel('file.xlsx', sheet_name=1)   # zero-indexed
data = pd.read_excel('file.xlsx', sheet_name='sheet2')

#import the first two sheets (result is a dict of data frames)
data = pd.read_excel('file.xlsx', sheet_name=[0,1])   
data = pd.read_excel('file.xlsx', sheet_name=['sheet1','sheet2'])

#load every sheet in the file (returns a dict with one data frame per sheet, keyed by sheet name)
data = pd.read_excel('file.xlsx', sheet_name=None)
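
Since sheet_name=None yields a dict, the per-sheet frames can be combined; a minimal sketch, assuming all sheets share the same columns:

#concatenate every sheet into a single data frame
all_sheets = pd.read_excel('file.xlsx', sheet_name=None)
data = pd.concat(all_sheets.values(), ignore_index=True)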

#with dtype specify column types
#true_values and false_values are useful to convert boolean columns
data = pd.read_excel('file.xlsx',
                     dtype={'col5': bool,
                            'col6': bool},
                     true_values=["Yes"],
                     false_values=["No"])
#NB: blank values in boolean columns are set to True!

#for datetime columns use parse_dates, not dtype
#if the column is in a standard datetime format this is enough:
data = pd.read_excel('file.xlsx',
                     parse_dates=['col1', 'col2'])

#for a non-standard date column, e.g. 311299 for 31/12/1999,
#parse it after loading with pd.to_datetime and a format string
format_string = '%d%m%y'
data['col3'] = pd.to_datetime(data['col3'], format=format_string)

2. Importing Data from Databases

import pandas as pd

#library to connect to databases
from sqlalchemy import create_engine

#create the database engine for the db data.db on sqlite
engine = create_engine('sqlite:///data.db')
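
The same pattern works for other databases by changing the connection URL; a sketch with placeholder credentials (requires the matching driver, e.g. psycopg2):

#engine for a PostgreSQL database
engine = create_engine('postgresql://user:password@localhost:5432/mydb')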

#view the tables in the database
#(engine.table_names() is removed in SQLAlchemy 2.0; use an inspector)
from sqlalchemy import inspect
print(inspect(engine).get_table_names())

#load the whole tab table without writing any SQL
tabloaded = pd.read_sql('tab', engine)

#create a SQL query to load data from a query
query = 'SELECT col1, col2 FROM tab WHERE col1 > 0'

#run the SQL query and load the result into a data frame
tabloaded = pd.read_sql(query, engine)
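
For results too large to hold in memory, read_sql also accepts chunksize and returns an iterator; a minimal sketch (the per-chunk handling is a placeholder):

#iterate over the query result 500 rows at a time
for chunk in pd.read_sql(query, engine, chunksize=500):
    print(chunk.shape)   # placeholder: process each chunk here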

3. Importing JSON Data and Working with APIs

import pandas as pd

#load a JSON file to a data frame
jsondata = pd.read_json('file.json')

#we can specify the orientation of the JSON with the orient parameter:
#'split': dict like {index -> [index], columns -> [columns], data -> [values]}
#'records': list like [{column -> value}, ... , {column -> value}]
#'index': dict like {index -> {column -> value}}
#'columns': dict like {column -> {index -> value}}
#'values': just the values array

jsondata = pd.read_json('file.json', orient = 'split')
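
A tiny self-contained example of the 'records' orientation (synthetic data; wrapped in StringIO because recent pandas deprecates passing a raw JSON string):

import io
records = '[{"col1": 1, "col2": "a"}, {"col1": 2, "col2": "b"}]'
jsondata = pd.read_json(io.StringIO(records), orient='records')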

Import data from an API

#requests library used for any url
import requests

#example: using the Yelp API
api_url = "https://api.yelp.com/v3/businesses/search"

#set up parameter dictionary according to documentation
params = {"term": "bookstore", "location": "San Francisco"}

#set up header dictionary w/ API key according to documentation
headers = {"Authorization": "Bearer {}".format(api_key)}

#call the API
response = requests.get(api_url, params=params, headers=headers)

#isolate the JSON data from the response object
data = response.json()

#load businesses data to a data frame
df = pd.DataFrame(data["businesses"])

#json_normalize flattens nested JSON
#(importing it from pandas.io.json is deprecated since pandas 1.0;
#call it as pd.json_normalize instead)

#flatten data and load to data frame, with _ separators
df = pd.json_normalize(data["businesses"], sep="_")

#flatten categories data, bring in business details
df = pd.json_normalize(data["businesses"], sep="_", record_path="categories",
                       meta=["name", "alias", "rating",
                             ["coordinates", "latitude"],
                             ["coordinates", "longitude"]],
                       meta_prefix="biz_")
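
To make record_path and meta concrete, a minimal sketch on a synthetic payload shaped like the Yelp response (all values invented):

sample = [{"name": "Shop A", "rating": 4.5,
           "coordinates": {"latitude": 37.77, "longitude": -122.42},
           "categories": [{"alias": "books", "title": "Bookstores"}]}]
flat = pd.json_normalize(sample, sep="_", record_path="categories",
                         meta=["name", "rating", ["coordinates", "latitude"]],
                         meta_prefix="biz_")
#resulting columns: alias, title, biz_name, biz_rating, biz_coordinates_latitude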

4. Combining multiple datasets

import pandas as pd

#appending datasets: DataFrame.append was removed in pandas 2.0,
#use the pd.concat function instead
appended = pd.concat([df1, df2])

#set ignore_index to True to renumber rows
appended = pd.concat([df1, df2], ignore_index=True)

#merge(): pandas function and data frame method for merging datasets
#default behaves like a SQL inner join (keeps only keys present in both)
merged = df1.merge(df2, on = 'col1')
merged = df1.merge(df2, left_on = 'col1', right_on = 'col2')
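
The how parameter selects the join type ('left', 'right', 'outer', 'inner'); for example, a left join keeps every row of df1:

#left join: keep all rows from df1, fill missing matches with NaN
merged = df1.merge(df2, on='col1', how='left')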