I just finished my 4th week (of 12) at the Metis Data Science Bootcamp in SoMa, where I will be holed up until the end of September training to become a data scientist. I'll be using this space to document my progress and offer up my thoughts, both related to data science and my other interests.
To kick things off, I'll do a quick recap of the 2nd project assigned as part of the program, dubbed Benson. The assignment was to build a regression model predicting box office success for movies, using data acquired from the web. Skills involved included web scraping (with Beautiful Soup / Selenium), data manipulation with pandas, and our first taste of regression and machine learning with the ubiquitous scikit-learn library.
Scraping from Box Office Mojo
The first step was to acquire a large data set of movie characteristics (genre, actors, rating, etc.) and box office revenue numbers from the web. I chose to gather the entire set of movies profiled on Box Office Mojo, a total of 16,000+ films. This consisted of 4 main steps:
- Building a list of movie page URLs from the alphabetical index pages
- Visiting movie URLs with `BeautifulSoup` and retrieving raw HTML
- Extracting relevant data (revenue figures, actors/directors, release dates)
- Storing data in pandas `DataFrame`s and CSV files for analysis
Here is an example of a Box Office Mojo page for the new Ghostbusters release:
Code for scraping the movie URL list:
from bs4 import BeautifulSoup as bs
import requests
import re
import csv
import pandas as pd

# Start from the numeric index page and collect links to every letter's index page
url = 'http://www.boxofficemojo.com/movies/alphabetical.htm?letter=NUM'
response = requests.get(url)
soup = bs(response.text, 'lxml')
letters = soup.findAll('a', href=re.compile('letter='))
# The letter links appear in duplicate on the page, so keep only the first half
letters_index = ['http://www.boxofficemojo.com{}'.format(letter['href'])
                 for letter in letters][:int(len(letters) / 2)]

# Letters with many titles are split across numbered subpages (page=2, page=3, ...)
letter_subpages = []
for letter_page in letters_index:
    response = requests.get(letter_page)
    soup = bs(response.text, 'lxml')
    subpage_links = soup.findAll('a', href=re.compile('page='))
    subpage_urls = ['http://www.boxofficemojo.com{}'.format(subpage['href'])
                    for subpage in subpage_links]
    subpage_urls = subpage_urls[:int(len(subpage_urls) / 2)]
    letter_subpages.extend(subpage_urls)

letters_index.extend(letter_subpages)
letters_index = sorted(letters_index)
print('Ready to scrape {} subpages!'.format(len(letters_index)))

# Pull the movie title and BOM id out of every movie link on each index subpage
movie_link_data = []
for letter_subpage_url in letters_index:
    response = requests.get(letter_subpage_url)
    soup = bs(response.text, 'lxml').find(id='body')
    movie_hrefs = soup.findAll('a', href=re.compile('id='))
    for movie_href in movie_hrefs:
        row_dict = {}
        row_dict['id'] = movie_href['href'].replace('/movies/?id=', '').replace('.htm', '')
        row_dict['title'] = movie_href.text
        movie_link_data.append(row_dict)

movie_page_df = pd.DataFrame(movie_link_data)
movie_page_df.to_csv('movie_page_urls.csv', index=False)
print('Collected titles and ids for {} movies!'.format(len(movie_page_df.index)))
Collected titles and ids for 16553 movies!
Extracting data from movie pages
BOM pages are nearly devoid of `id` and `class` attributes, perhaps to deter users from scraping the site and gobbling up server bandwidth. This made the BeautifulSoup code rather ugly, with lots of messy case handling:
from time import sleep
from random import randint

def scrape_movie(movie_id):
    """Fetch a movie page from Box Office Mojo and return its main body element."""
    url = 'http://www.boxofficemojo.com/movies/?id={}.htm'.format(movie_id)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0'}
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print('{}: Received status code {}.'.format(movie_id, response.status_code))
        return None
    soup = bs(response.text, 'lxml').find(id='body')
    return soup
def get_revenue(movie_body):
    """Pull domestic and foreign gross figures out of the revenue table."""
    domestic_revenue_tag = movie_body.find(text='Domestic:')
    foreign_revenue_tag = movie_body.find(text='Foreign:')
    if not domestic_revenue_tag:
        domestic_revenue = None
    else:
        # No usable class/id hooks, so navigate by position from the label cell;
        # parse_currency() (defined separately) turns strings like '$5 million' into ints
        domestic_revenue = parse_currency(domestic_revenue_tag.parent.parent.next_sibling.next_sibling.text.strip())
    if not foreign_revenue_tag:
        foreign_revenue = None
    else:
        foreign_revenue = parse_currency(foreign_revenue_tag.parent.parent.next_sibling.next_sibling.text.strip())
    return {'domestic_revenue': domestic_revenue, 'foreign_revenue': foreign_revenue}
def get_players(movie_body):
    """Extract director, writer, composer, and actor ids from 'The Players' box."""
    players_table = movie_body.find(class_='mp_box_tab', text='The Players')
    if not players_table:
        return None
    players_table = players_table.next_sibling.next_sibling

    director_table = players_table.findAll('a', href=re.compile('view=Director&id'))
    if not director_table:
        directors = None
    else:
        directors = [director['href'].replace('/people/chart/?view=Director&id=', '').replace('.htm', '')
                     for director in director_table]

    writer_table = players_table.findAll('a', href=re.compile('view=Writer&id'))
    if not writer_table:
        writers = None
    else:
        writers = [writer['href'].replace('/people/chart/?view=Writer&id=', '').replace('.htm', '')
                   for writer in writer_table]

    composer_table = players_table.findAll('a', href=re.compile('view=Composer&id'))
    if not composer_table:
        composers = None
    else:
        composers = [composer['href'].replace('/people/chart/?view=Composer&id=', '').replace('.htm', '')
                     for composer in composer_table]

    actor_table = players_table.findAll('a', href=re.compile('view=Actor&id'))
    if not actor_table:
        actors = None
    else:
        actors = [actor['href'].replace('/people/chart/?view=Actor&id=', '').replace('.htm', '')
                  for actor in actor_table]

    players = {
        'directors': directors,
        'actors': actors,
        'composers': composers,
    }
    return players
# movie_page_df is indexed by movie id here, with a 'scraped' flag tracking progress;
# parse_movie_data() (not shown) wraps get_revenue(), get_players(), and the other field parsers
error_log = {}
movie_scrape_status = []
movie_data_df = pd.DataFrame()
movie_data = []

with open('movie_data_scraped_2.csv', 'a') as csvfile:
    field_names = ['actors', 'budget', 'composers', 'directors', 'distributor',
                   'opening_domestic', 'domestic_revenue', 'foreign_revenue',
                   'genres', 'genre_primary', 'id', 'mpaa_rating', 'release_date',
                   'runtime', 'opening_theaters', 'title']
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    # writer.writeheader()
    for movie in movie_page_df[movie_page_df['scraped'] == False].index:
        # Small randomized delay between requests to avoid hammering the server
        sleep(randint(1, 5) * 0.1 + 1)
        try:
            scrape_status = {}
            movie_body = scrape_movie(movie)
            if movie_body is None:
                continue
            movie_entry = parse_movie_data(movie_body)
            movie_entry.update({'id': movie, 'title': movie_page_df.loc[movie].title})
            print(movie + ': OK')
            movie_data.append(movie_entry)
            # Flush to the DataFrame and the CSV every 50 movies
            if len(movie_data) >= 50:
                print('Writing to DataFrame!')
                movie_data_df = movie_data_df.append(pd.DataFrame.from_records(movie_data, index='id'))
                print('Writing to CSV!')
                for row_dict in movie_data:
                    writer.writerow(row_dict)
                movie_data = []
            scrape_status.update({'id': movie, 'status': 'OK'})
        except Exception as e:
            error_log[movie] = str(e)
            print(str(e))
            scrape_status.update({'id': movie, 'status': 'error'})
        movie_scrape_status.append(scrape_status)
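For what it's worth, spot-checking the parsers on a single page before a run like this saves a lot of wasted hours. A minimal sketch (the movie id here is just an illustrative guess, not one pulled from the actual URL list):

# Sanity-check the scraper and parsers on one page before kicking off the full crawl
test_body = scrape_movie('ghostbusters2016')  # hypothetical id, for illustration only
if test_body is not None:
    print(get_revenue(test_body))
    print(get_players(test_body))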
After some trial and error, my scraper was ready to grab 16k+ pages from the BOM website. Several hours later...
Examining 16,000 rows of movie data
After a few runs of cleaning and conversion, the data was more or less ready for analysis. Some of the steps involved:
- Standardizing currency fields (`$5 million` -> `5000000`, `$500,000` -> `500000`); a rough sketch of this helper follows the list
- Converting fields with multiple categorical values to `dict`s (`['chloemoretz', 'minkakelly']` -> `{'chloemoretz': 1, 'minkakelly': 1}`)
- Adjusting revenues and budgets for inflation by standardizing them to June 2016 dollars
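The `parse_currency` helper referenced in `get_revenue` above handles the first of these conversions; it wasn't shown earlier, so here's a rough sketch of what it might look like (the exact formats handled are assumptions based on the examples above):

def parse_currency(raw):
    """Sketch: convert BOM currency strings like '$5 million' or '$500,000' to ints."""
    if raw is None:
        return None
    text = raw.strip().lower().replace('$', '').replace(',', '')
    match = re.match(r'([\d.]+)\s*(million|billion)?', text)
    if not match:
        return None
    value = float(match.group(1))
    multiplier = {'million': 1e6, 'billion': 1e9}.get(match.group(2), 1)
    return int(value * multiplier)

# parse_currency('$5 million') -> 5000000, parse_currency('$500,000') -> 500000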
How do we treat categorical variables?
One topic covered in class was vectorization of categorical variables into binary columns. Behold!
# Vectorize categorical variables
from sklearn.feature_extraction import DictVectorizer

def vectorize(df, col_name, sparse=False, prefix=None):
    """Expand a column of {category: 1} dicts into one binary column per category."""
    vec = DictVectorizer()
    dict_col = df.loc[:, col_name]
    vec_sparse = vec.fit_transform(dict_col)
    if sparse:
        return vec_sparse
    vec_array = vec_sparse.toarray()
    columns = (['{}_{}'.format(prefix, feature_name) for feature_name in vec.get_feature_names()]
               if prefix else vec.get_feature_names())
    vec_df = pd.DataFrame(data=vec_array, index=df.index, columns=columns)
    return vec_df
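A usage sketch, assuming the actor dicts built during cleaning live in an `actors` column (the actual call isn't shown here):

# Expand the 'actors' dict column into one binary 'actor_<id>' column per actor;
# in the analysis below this is combined with release dates and revenues as actor_df
actor_df = vectorize(df, 'actors', prefix='actor')
print(actor_df.shape)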
Now we have 800 additional variables in the DataFrame, and can visualize the top actors by number of appearances. Are we really surprised?
import numpy as np
import matplotlib.pyplot as plt

# Count appearances per actor column and sort descending
actor_appearances = sorted([(actor_col, actor_df[actor_col].sum()) for actor_col in actor_df.columns.tolist()],
                           key=lambda x: x[1], reverse=True)
actors = [actor_count[0] for actor_count in actor_appearances]
appearances = [actor_count[1] for actor_count in actor_appearances]

n_actors = 50
plt.figure(figsize=(10, 10))
plt.xlabel('# of film appearances')
plt.ylabel('Top 50 actors in Box Office Mojo database')
plt.barh(np.arange(n_actors), appearances[:n_actors], 0.4,
         tick_label=[actor.replace('actor_', '') for actor in actors][:n_actors])
plt.gca().invert_yaxis()
plt.tick_params(axis='both', which='major', labelsize=12)
plt.title('Number of appearances by actor: top 50')
plt.show()
For the time being, I did some simple feature engineering to compress the actor appearance data into 2 columns:
- `actor_prev_count`: the number of previous films made by the actors featured in a film
- `actor_prev_rev`: the total revenue earned by those previous films
def actors_prev_exp(row, actor_df):
    """For each actor in a film, count their earlier films and sum those films' revenue."""
    actors_prev_movie_count = {}
    for actor, _ in row['actors'].items():
        # Earlier appearances: rows where this actor's column is set and the release predates this film
        appearances = actor_df.loc[(actor_df['actor_' + actor] > 0) &
                                   (actor_df['release_date'] < row['release_date'])]
        actors_prev_movie_count[actor] = {'count': appearances.shape[0],
                                          'revenue': appearances['revenue'].sum()}
    row['actors_prev'] = actors_prev_movie_count
    return row

df = df.apply(actors_prev_exp, axis=1, actor_df=actor_df)
df['actor_prev_count'] = df['actors_prev'].apply(lambda x: sum([v['count'] for v in list(x.values())]))
df['actor_prev_rev'] = df['actors_prev'].apply(lambda x: sum([v['revenue'] for v in list(x.values())]))
Exploring variable relationships
Seaborn comes in handy for visualizing variable-pair relationships and generating ideas for additional features.
import seaborn as sns
sns.pairplot(df_vars[["budget_parsed_adj", "opening_theaters", "actor_prev_rev", "domestic_revenue_adj"]])
I tried transforming the `budget_parsed_adj`, `domestic_revenue_adj`, and `actor_prev_rev` variables to log values:
df_vars.loc[:, "domestic_revenue_adj_log"] = np.log(df_vars.domestic_revenue_adj)
df_vars.loc[:, "budget_parsed_adj_log"] = np.log(df_vars.budget_parsed_adj)
df_vars.loc[:, 'actor_prev_rev_log'] = np.log(df_vars.loc[:, 'actor_prev_rev'])
sns.pairplot(df_vars[["budget_parsed_adj_log", "opening_theaters", "actor_prev_rev_log", "domestic_revenue_adj_log"]])
Starting to see some cleaner relationships!
Running models
Time to become the (automated) movie success whisperer! Running linear and ridge regressions produced the following results:
# Scaling variables
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV

# X: the predictor columns (from df_vars above); y: the domestic revenue target
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)
y = y / y.median()

models = {'linreg': LinearRegression(), 'ridge': RidgeCV(alphas=[0.1, 1.0, 10.0])}
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name)
    print(model.coef_)
    print('Training Score: {}'.format(model.score(X_train, y_train)))
    print('Test Score: {}'.format(model.score(X_test, y_test)))
linreg
[ 0.03938255 0.04354528 -0.00163238]
Training Score: 0.5068223807389381
Test Score: 0.3734715765598262
ridge
[ 0.03923569 0.04336833 -0.00150752]
Training Score: 0.5068157184060109
Test Score: 0.37462124220183046
Somewhat respectable fit scores!
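Beyond the scores, plotting predictions against actuals on the held-out set is a quick sanity check; a sketch using the fitted linear model from above:

# Sketch: predicted vs. actual (median-scaled revenue) for the linear model on the test set
y_pred = models['linreg'].predict(X_test)
plt.scatter(y_test, y_pred)
plt.xlabel('domestic revenue (actual, median-scaled)')
plt.ylabel('domestic revenue (predicted, median-scaled)')
plt.title('Linear regression: predicted vs. actual')
plt.show()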
Another approach: gradient boosting with genre data added
In search of a better model, I tried applying a gradient boosting technique, this time with movie genre data (encoded as one-hot variables):
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV

X_onehot = df_vars_onehot
scaler_1 = preprocessing.StandardScaler().fit(X_onehot)
X_scaled_1 = scaler_1.transform(X_onehot)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_scaled_1, y)

gb = GradientBoostingRegressor()
param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [3, 4, 5],
              'min_samples_leaf': [3, 5, 10, 20],
              # 'max_features': [1.0, 0.3, 0.1]  ## not possible in our example (only 1 fx)
              }
gs = GridSearchCV(gb, param_grid).fit(X_train_1, y_train_1)
gb_model = gs.best_estimator_
print('Training score: ' + str(gs.score(X_train_1, y_train_1)))
print('Test score: ' + str(gb_model.score(X_test_1, y_test_1)))
Training score: 0.6382110233431133
Test score: 0.54062209368206093
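A nice side effect of tree-based models is that they expose feature importances; a quick sketch for inspecting them (assuming `df_vars_onehot` is a DataFrame whose column order matches the scaled matrix):

# Sketch: rank features by the boosted model's importance scores
importances = sorted(zip(df_vars_onehot.columns, gb_model.feature_importances_),
                     key=lambda pair: pair[1], reverse=True)
for feature, importance in importances[:10]:
    print('{}: {:.3f}'.format(feature, importance))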
Visualizing shows a better fit as well:
y_predict_1 = gb_model.predict(X_test_1)
plot_line = np.linspace(0.6, 1.2, 100)
plt.xlabel('log(domestic_revenue) - actual')
plt.ylabel('log(domestic_revenue) - predicted')
plt.title('Revenue prediction - Gradient Boosting with actor data')
plt.plot(plot_line, plot_line, 'g')
plt.scatter(y_test_1, y_predict_1)
plt.show()
Key takeaways
- Acquiring data is... a PITA
- Models are black boxes until you take the time to understand the math and implementations
- Even a trivial blog setup can take a while to get up and running