# Initialize Otter
import otter
grader = otter.Notebook("final-project.ipynb")
# Run this cell to set up the notebook, but please don't change it.
import numpy as np
import math
from datascience import *
# These lines set up the plotting functionality and formatting.
import matplotlib
matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
In this project, we are exploring movie screenplays. We'll be trying to predict each movie's genre from the text of its screenplay. In particular, we have compiled a list of 5,000 words that occur in conversations between movie characters. For each movie, our dataset tells us the frequency with which each of these words occurs in certain conversations in its screenplay. All words have been converted to lowercase.
Run the cell below to read the movies table. It may take up to a minute to load.
movies = Table.read_table('data/movies.csv')
movies.where("Title", "wild wild west").select(0, 1, 2, 3, 4, 14, 49, 1042, 4004)
Title | Year | Rating | Genre | # Words | breez | england | it | bravo |
---|---|---|---|---|---|---|---|---|
wild wild west | 1999 | 4.3 | comedy | 3446 | 0 | 0 | 0.0212635 | 0 |
The above cell prints a few columns of the row for the comedy movie Wild Wild West. The movie contains 3446 words. The word "it" appears 74 times, making up $\frac{74}{3446} \approx 0.021$ of the words in the movie. The word "england" doesn't appear at all. This numerical representation of a body of text, one that describes only the frequencies of individual words, is called a bag-of-words representation. A lot of information is discarded in this representation: the order of the words, the context of each word, who said what, the cast of characters and actors, etc. However, a bag-of-words representation is often used for machine learning applications as a reasonable starting point, because a great deal of information is also retained and expressed in a convenient and compact format. In this project, we will investigate whether this representation is sufficient to build an accurate genre classifier.
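For intuition, here is a minimal sketch of how a bag-of-words frequency vector could be computed from a toy line of dialogue. The line below is hypothetical; the real dataset aggregates whole screenplays through a more involved pipeline.
from collections import Counter
# Hypothetical line of dialogue (the dataset was built from full screenplays).
line = "it is what it is"
words = line.lower().split()
counts = Counter(words)                                 # raw counts: {'it': 2, 'is': 2, 'what': 1}
frequencies = {w: c / len(words) for w, c in counts.items()}
frequencies['it']                                       # 2/5 = 0.4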
All movie titles are unique. The row_for_title function provides fast access to the one row for each title.
Note: All movies in our dataset have their titles lower-cased.
title_index = movies.index_by('Title')
def row_for_title(title):
"""Return the row for a title, similar to the following expression (but faster)
movies.where('Title', title).row(0)
"""
return title_index.get(title)[0]
row_for_title('the terminator')
For example, the fastest way to find the frequency of "none" in the movie The Terminator is to access the 'none' item from its row. Check the original table to see if this worked for you!
row_for_title('the terminator').item('none')
0.0009633911368015
Set expected_row_sum to the number that you expect will result from summing all proportions in each row, excluding the first five columns.
# Set expected_row_sum to a number that's the (approximate) sum of each row of word proportions.
expected_row_sum = 1
expected_row_sum
1
grader.check("q1_0")
q1_0
passed!
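As a sanity check (a sketch, assuming the column layout described above), we can sum the word proportions in a single row and confirm the total is close to 1:
# Sum the word proportions (everything after the first five columns) of row 0.
word_columns = movies.drop(np.arange(5))
np.sum(np.array(list(word_columns.row(0))))   # approximately 1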
This dataset was extracted from a dataset from Cornell University. After transforming the dataset (e.g., converting the words to lowercase, removing the naughty words, and converting the counts to frequencies), we created this new dataset containing the frequency of 5000 common words in each movie.
print('Words with frequencies:', movies.drop(np.arange(5)).num_columns)
print('Movies with genres:', movies.num_rows)
Words with frequencies: 5000
Movies with genres: 370
The columns other than "Title", "Year", "Rating", "Genre", and "# Words" in the movies table are all words that appear in some of the movies in our dataset. These words have been stemmed, or abbreviated heuristically, in an attempt to make different inflected forms of the same base word into the same string. For example, the column "manag" is the sum of proportions of the words "manage", "manager", "managed", and "managerial" (and perhaps others) in each movie. This is a common technique used in machine learning and natural language processing.
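To make the idea concrete, here is a toy suffix-stripping stemmer. It is purely illustrative and is not the stemmer that was used to build this dataset:
def toy_stem(word):
    """Crude illustration of stemming: strip one of a few common suffixes."""
    for suffix in ['erial', 'er', 'ed', 'e']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
[toy_stem(w) for w in ['manage', 'manager', 'managed', 'managerial']]
# ['manag', 'manag', 'manag', 'manag']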
Stemming makes it a little tricky to search for the words you want to use, so we have provided another table that will let you see examples of unstemmed versions of each stemmed word. Run the code below to load it.
# Just run this cell.
vocab_mapping = Table.read_table('data/stem.csv')
stemmed = np.take(movies.labels, np.arange(5, len(movies.labels)))
vocab_table = Table().with_column('Stem', stemmed).join('Stem', vocab_mapping)
vocab_table.take(np.arange(1100, 1110))
Stem | Word |
---|---|
bond | bonding |
bone | bone |
bone | boning |
bone | bones |
bonu | bonus |
book | bookings |
book | books |
book | booking |
book | booked |
book | book |
Assign stemmed_message to the stemmed version of the word "vegetables".
stemmed_message = vocab_table.where('Word', 'vegetables').column(0).item(0)
stemmed_message
'veget'
grader.check("q1_1_1")
q1_1_1
passed!
What stem in the dataset has the most words that are shortened to it? Assign most_stem to that stem.
most_stem = vocab_table.group('Stem').sort('count', descending=True).column(0).item(0)
most_stem
'gener'
grader.check("q1_1_2")
q1_1_2
passed!
What is the longest word in the dataset whose stem wasn't shortened? Assign that to longest_uncut. Break ties alphabetically from Z to A (so if your options are "albatross" or "batman", you should pick "batman").
# In our solution, we found it useful to first add columns with
# the length of the word and the length of the stem,
# and then to add a column with the difference between those lengths.
# What will the difference be if the word is not shortened?
len_stem = vocab_table.apply(len, 'Stem')
len_word = vocab_table.apply(len, 'Word')
tbl_with_lens = vocab_table.with_columns('stem length', len_stem, 'word length', len_word)
tbl_with_dif = tbl_with_lens.with_column('difference', tbl_with_lens.column('word length') - tbl_with_lens.column('stem length'))
uncut = tbl_with_dif.where('difference', 0)
# Among the longest uncut words, the built-in max breaks ties from Z to A.
longest_words = uncut.where('word length', max(uncut.column('word length'))).column('Word')
longest_uncut = max(longest_words)
longest_uncut
'misunderstand'
grader.check("q1_1_3")
q1_1_3
passed!
Let's explore our dataset before trying to build a classifier. To start, we'll look at the relationship between the proportions of different words.
The first association we'll investigate is the association between the proportion of words that are "outer" and the proportion of words that are "space".
As usual, we'll investigate our data visually before performing any numerical analysis.
Run the cell below to plot a scatter diagram of space proportions vs. outer proportions and to create the outer_space table.
# Just run this cell!
outer_space = movies.select("outer", "space")
outer_space.scatter("outer", "space")
plots.axis([-0.001, 0.0025, -0.001, 0.005]);
plots.xticks(rotation=45);
Looking at that chart, it is difficult to tell whether there is an association. Calculate the correlation coefficient for the association between the proportion of words that are "outer" and the proportion of words that are "space" for every movie in the dataset, and assign it to outer_space_r.
# Our solution took multiple lines
# these two arrays should make your code cleaner!
outer = movies.column("outer")
space = movies.column("space")
outer_su = (outer - np.mean(outer)) / np.std(outer)
space_su = (space - np.mean(space)) / np.std(space)
outer_space_r = np.mean(outer_su * space_su)
outer_space_r
0.2829527833012746
grader.check("q1_2_1")
q1_2_1
passed!
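Since correlations in standard units come up again below, the same recipe can be packaged as a helper. This is just a convenience sketch; the exercises below compute it inline:
def correlation(tbl, x_label, y_label):
    """Correlation coefficient between two columns, computed via standard units."""
    x = tbl.column(x_label)
    y = tbl.column(y_label)
    x_su = (x - np.mean(x)) / np.std(x)
    y_su = (y - np.mean(y)) / np.std(y)
    return np.mean(x_su * y_su)
correlation(movies, "outer", "space")   # matches outer_space_r above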
Choose two different words in the dataset (other than "outer" and "space") with a correlation higher than 0.2 or smaller than -0.2, and plot a scatter plot with a line of best fit for them. The code to plot the scatter plot and line of best fit is given for you; you just need to calculate the correct values for r, slope, and intercept.
Hint: It's easier to think of words with a positive correlation, i.e. words that are often mentioned together.
Hint 2: Try to think of common phrases or idioms.
word_x = "soldier"
word_y = "war"
# These arrays should make your code cleaner!
arr_x = movies.column(word_x)
arr_y = movies.column(word_y)
x_su = (arr_x - np.mean(arr_x)) / np.std(arr_x)
y_su = (arr_y - np.mean(arr_y)) / np.std(arr_y)
r = np.mean(x_su * y_su)
slope = r * np.std(arr_y) / np.std(arr_x)
intercept = np.mean(arr_y) - slope * np.mean(arr_x)
# DON'T CHANGE THESE LINES OF CODE
movies.scatter(word_x, word_y)
max_x = max(movies.column(word_x))
plots.title(f"Correlation: {r}, magnitude greater than .2: {abs(r) >= 0.2}")
plots.plot([0, max_x * 1.3], [intercept, intercept + slope * (max_x*1.3)], color='gold');
We're going to use our movies dataset for two purposes: first, to train movie genre classifiers, and second, to test the performance of those classifiers. Hence, we need two different datasets: training and test.
The purpose of a classifier is to classify unseen data that is similar to the training data. Therefore, we must ensure that there are no movies that appear in both sets. We do so by splitting the dataset randomly. The dataset has already been permuted randomly, so it's easy to split. We just take the top for training and the rest for test.
Run the code below (without changing it) to separate the datasets into two tables.
# Here we have defined the proportion of our data
# that we want to designate for training as 17/20ths
# of our total dataset. 3/20ths of the data is
# reserved for testing.
training_proportion = 17/20
num_movies = movies.num_rows
num_train = int(num_movies * training_proportion)
num_test = num_movies - num_train
train_movies = movies.take(np.arange(num_train))
test_movies = movies.take(np.arange(num_train, num_movies))
print("Training: ", train_movies.num_rows, ";",
"Test: ", test_movies.num_rows)
Training: 314 ; Test: 56
Draw a horizontal bar chart with two bars that show the proportion of Comedy movies in each dataset. Complete the function comedy_proportion first; it should help you create the bar chart.
def comedy_proportion(table):
    # Return the proportion of movies in a table that have the Comedy genre.
    return table.where('Genre', 'comedy').num_rows / table.num_rows
# Your solution may take multiple lines. Start by creating a table.
# If you get stuck, think about what sort of table you need for barh to work
dataset_array = make_array('Training', 'Test')
proportions_array = make_array(comedy_proportion(train_movies), comedy_proportion(test_movies))
comedy_proportions = Table().with_columns('Dataset', dataset_array, 'Proportion', proportions_array)
comedy_proportions.barh('Dataset', 'Proportion')
K-Nearest Neighbors (k-NN) is a classification algorithm. Given some numerical attributes (also called features) of an unseen example, it decides whether that example belongs to one or the other of two categories based on its similarity to previously seen examples. Predicting the category of an example is called labeling, and the predicted category is also called a label.
The features we have for each movie are the proportions of its words that are particular words, and the labels are two movie genres: comedy and thriller. The algorithm requires many previously seen examples for which both the features and labels are known: that's the train_movies table.
To build understanding, we're going to visualize the algorithm instead of just describing it.
In k-NN, we classify a movie by finding the k movies in the training set that are most similar according to the features we choose. We call those movies with similar features the nearest neighbors. The k-NN algorithm assigns the movie to the most common category among its k nearest neighbors.
Let's limit ourselves to just 2 features for now, so we can plot each movie. The features we will use are the proportions of the words "water" and "feel" in the movie. Taking the movie Monty Python and the Holy Grail (in the test set), 0.000804074 of its words are "water" and 0.0010721 are "feel". This movie appears in the test set, so let's imagine that we don't yet know its genre.
First, we need to make our notion of similarity more precise. We will say that the distance between two movies is the straight-line distance between them when we plot their features in a scatter diagram.
This distance is called the Euclidean ("yoo-KLID-ee-un") distance, whose formula is $\sqrt{(x_1 - x_2)^2 + (y_1 - y_2)^2}$.
For example, in the movie Clerks. (in the training set), 0.00016293 of all the words in the movie are "water" and 0.00154786 are "feel". Its distance from Monty Python and the Holy Grail on this 2-word feature set is $\sqrt{(0.000804074 - 0.000162933)^2 + (0.0010721 - 0.00154786)^2} \approx 0.000798379$. (If we included more or different features, the distance could be different.)
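We can verify that arithmetic directly, using the proportions quoted above:
# Distance between "monty python and the holy grail" and "clerks." on (water, feel).
np.sqrt((0.000804074 - 0.000162933)**2 + (0.0010721 - 0.00154786)**2)   # about 0.000798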
A third movie, The Avengers (in the training set), has a "water" proportion of 0 and a "feel" proportion of 0.00103173.
The function below creates a plot to display the "water" and "feel" features of a test movie and some training movies. As you can see in the result, Monty Python and the Holy Grail is more similar to Clerks. than to The Avengers based on these features, which makes sense: both are comedy movies, while The Avengers is a thriller.
# Just run this cell.
def plot_with_two_features(test_movie, training_movies, x_feature, y_feature):
"""Plot a test movie and training movies using two features."""
test_row = row_for_title(test_movie)
distances = Table().with_columns(
x_feature, [test_row.item(x_feature)],
y_feature, [test_row.item(y_feature)],
'Color', ['unknown'],
'Title', [test_movie]
)
for movie in training_movies:
row = row_for_title(movie)
distances.append([row.item(x_feature), row.item(y_feature), row.item('Genre'), movie])
distances.scatter(x_feature, y_feature, group='Color', labels='Title', s=30)
training = ["clerks.", "the avengers"]
plot_with_two_features("monty python and the holy grail", training, "water", "feel")
plots.axis([-0.001, 0.0011, -0.004, 0.008]);
Compute the Euclidean distance (defined in the section above) between the two movies Monty Python and the Holy Grail and The Avengers, using the water and feel features only. Assign it the name one_distance.
Note: If you have a row, you can use item to get a value from a column by its name. For example, if r is a row, then r.item("Genre") is the value in column "Genre" in row r.
Hint: Remember the function row_for_title, redefined for you below.
title_index = movies.index_by('Title')
python = row_for_title("monty python and the holy grail")
avengers = row_for_title("the avengers")
one_distance = ((python.item('water') - avengers.item('water'))**2 + (python.item('feel')- avengers.item('feel'))**2) ** 0.5
one_distance
0.0008050869157478146
grader.check("q2_1_1")
q2_1_1
passed!
Below, we've added a third training movie, The Silence of the Lambs. Before, the point closest to Monty Python and the Holy Grail was Clerks., a comedy movie. However, now the closest point is The Silence of the Lambs, a thriller movie.
training = ["clerks.", "the avengers", "the silence of the lambs"]
plot_with_two_features("monty python and the holy grail", training, "water", "feel")
plots.axis([-0.001, 0.0011, -0.004, 0.008]);
Complete the function distance_two_features that computes the Euclidean distance between any two movies, using two features. The last two lines call your function to show that Monty Python and the Holy Grail is closer to The Silence of the Lambs than it is to Clerks.
def distance_two_features(title0, title1, x_feature, y_feature):
"""Compute the distance between two movies with titles title0 and title1
Only the features named x_feature and y_feature are used when computing the distance.
"""
row0 = row_for_title(title0)
row1 = row_for_title(title1)
return ((row0.item(x_feature) - row1.item(x_feature))**2 + (row0.item(y_feature) - row1.item(y_feature))**2) ** 0.5
for movie in make_array("clerks.", "the silence of the lambs"):
movie_distance = distance_two_features(movie, "monty python and the holy grail", "water", "feel")
print(movie, 'distance:\t', movie_distance)
clerks. distance: 0.0007983810687227716
the silence of the lambs distance: 0.00022256314855564847
grader.check("q2_1_2")
q2_1_2
passed!
Define the function distance_from_python so that it works as described in its documentation.
Note: Your solution should not use arithmetic operations directly. Instead, it should make use of existing functionality above!
def distance_from_python(title):
"""The distance between the given movie and "monty python and the holy grail",
based on the features "water" and "feel".
This function takes a single argument:
title: A string, the name of a movie.
"""
return distance_two_features("monty python and the holy grail", title, "water", "feel")
grader.check("q2_1_3")
q2_1_3
passed!
Using the features "water" and "feel", what are the names and genres of the 5 movies in the training set closest to Monty Python and the Holy Grail? To answer this question, make a table named close_movies containing those 5 movies with columns "Title", "Genre", "water", and "feel", as well as a column called "distance from python" that contains each movie's distance from Monty Python and the Holy Grail. The table should be sorted in ascending order by "distance from python".
# Your solution may take multiple lines.
distances_from_python = make_array()
for title in train_movies.column('Title'):
    distances_from_python = np.append(distances_from_python, distance_from_python(title))
close_movies = (train_movies
                .with_column("distance from python", distances_from_python)
                .select("Title", "Genre", "water", "feel", "distance from python")
                .sort("distance from python")
                .take(np.arange(5)))
close_movies
Title | Genre | water | feel | distance from python |
---|---|---|---|---|
alien | thriller | 0.00070922 | 0.00124113 | 0.000193831 |
tomorrow never dies | thriller | 0.000888889 | 0.000888889 | 0.00020189 |
the silence of the lambs | thriller | 0.000595948 | 0.000993246 | 0.000222563 |
innerspace | comedy | 0.000522193 | 0.00104439 | 0.00028324 |
some like it hot | comedy | 0.000528541 | 0.000951374 | 0.00030082 |
grader.check("q2_1_4")
q2_1_4
passed!
Next, we'll classify Monty Python and the Holy Grail based on the genres of the closest movies. To do so, define the function most_common so that it works as described in its documentation below.
def most_common(label, table):
"""The most common element in a column of a table.
This function takes two arguments:
label: The label of a column, a string.
table: A table.
It returns the most common value in that column of that table.
In case of a tie, it returns any one of the most common values.
"""
return table.group(label).sort('count', descending=True).column(0).item(0)
# Calling most_common on your table of 5 nearest neighbors classifies
# "monty python and the holy grail" as a thriller movie, 3 votes to 2.
most_common('Genre', close_movies)
'thriller'
grader.check("q2_1_5")
q2_1_5
passed!
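For instance, on a toy table (hypothetical values), the function behaves as expected:
toy = Table().with_column('Genre', make_array('comedy', 'thriller', 'comedy'))
most_common('Genre', toy)   # 'comedy' (2 votes to 1)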
Write a function called distance to compute the Euclidean distance between two arrays of numerical features (e.g. arrays of the proportions of times that different words appear). The function should be able to calculate the Euclidean distance between two arrays of arbitrary (but equal) length.
Next, use the function you just defined to compute the distance between the first and second movie in the training set using all of the features. (Remember that the first five columns of your tables are not features.)
Note: To convert rows to arrays, use np.array. For example, if t is a table, np.array(t.row(0)) converts row 0 of t into an array.
Note: If you're working offline, depending on the versions of your packages, you may need to convert rows to arrays using the following instead: np.array(list(t.row(0)))
def distance(features_array1, features_array2):
"""The Euclidean distance between two arrays of feature values."""
return np.sqrt(np.sum((features_array1 - features_array2)**2))
first_movie = np.array(train_movies.drop(np.arange(0, 5)).row(0))
second_movie = np.array(train_movies.drop(np.arange(0, 5)).row(1))
distance_first_to_second = distance(first_movie, second_movie)
distance_first_to_second
0.03335446890881317
grader.check("q3_0")
q3_0
passed!
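As a quick consistency check (a sketch reusing the python and avengers rows defined earlier), distance applied to two-element feature arrays should agree with one_distance from question 2.1.1:
python_features = make_array(python.item('water'), python.item('feel'))
avengers_features = make_array(avengers.item('water'), avengers.item('feel'))
distance(python_features, avengers_features)   # should equal one_distance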
Unfortunately, using all of the features has some downsides. One clear downside is computational -- computing Euclidean distances just takes a long time when we have lots of features. You might have noticed that in the last question!
So we're going to select just 20. We'd like to choose features that are very discriminative. That is, features which lead us to correctly classify as much of the test set as possible. This process of choosing features that will make a classifier work well is sometimes called feature selection, or, more broadly, feature engineering.
In this question, we will help you get started on selecting more effective features for distinguishing comedy from thriller movies. The plot below (generated for you) shows the average frequency with which each word occurs in comedy movies on the horizontal axis and the average frequency with which it occurs in thriller movies on the vertical axis.
Note: The line graphed is the line of best fit, NOT the line y = x.
The following questions ask you to interpret the plot above. For each question, select one of the following choices and assign its number to the provided name.
1. The word is common in both comedy and thriller movies
2. The word is uncommon in comedy movies and common in thriller movies
3. The word is common in comedy movies and uncommon in thriller movies
4. The word is uncommon in both comedy and thriller movies
5. It is not possible to say from the plot
What properties does a word in the bottom left corner of the plot have? Your answer should be a single integer from 1 to 5, corresponding to the correct statement from the choices above.
bottom_left = 4
grader.check("q3_1_1")
q3_1_1
passed!
Question 3.1.2
What properties does a word in the bottom right corner have?
bottom_right = 3
grader.check("q3_1_2")
q3_1_2
passed!
Question 3.1.3
What properties does a word in the top right corner have?
top_right = 1
grader.check("q3_1_3")
q3_1_3
passed!
Question 3.1.4
What properties does a word in the top left corner have?
top_left = 2
grader.check("q3_1_4")
q3_1_4
passed!
Question 3.1.5
If we see a movie with a lot of words that are common for comedy movies but uncommon for thriller movies, what would be a reasonable guess about the genre of the movie? Assign movie_genre_guess to the number corresponding to your answer:
1. It is a thriller movie.
2. It is a comedy movie.
movie_genre_guess = 2
grader.check("q3_1_5")
q3_1_5
passed!
Using the plot above, make an array of at least 10 common words that you think might let you distinguish between comedy and thriller movies. Make sure to choose words that are frequent enough that every movie contains at least one of them. Don't just choose the most frequent words, though; you can do much better.
You might want to come back to this question later to improve your list, once you've seen how to evaluate your classifier.
# Set my_features to an array of at least 10 features (strings that are column labels)
my_features = make_array('kill', 'mari', 'uh', 'well', 'dead', 'love', 'realli', 'great', 'yeah', 'gun')
# Select those features from both the train and test sets
train_my_features = train_movies.select(my_features)
test_my_features = test_movies.select(my_features)
grader.check("q3_1_6")
q3_1_6
passed!
This test makes sure that you have chosen words such that at least one appears in each movie. If you can't find words that satisfy this test just through intuition, try writing code to print out the titles of movies that do not contain any words from your list, then look at the words they do contain.
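One way to write that check (a sketch, assuming the my_features array defined above):
# Titles of movies that contain none of the chosen words.
total_proportion = sum(movies.column(word) for word in my_features)
movies.where(total_proportion == 0).column('Title')   # should be an empty array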
In two sentences or less, describe how you selected your features.
I selected my features by choosing words that were farther from the line of best fit, since that means they are more closely associated with one genre over the other.
Next, let's classify the first movie from our test set using these features. You can examine the movie by running the cells below. Do you think it will be classified correctly?
print("Movie:")
test_movies.take(0).select('Title', 'Genre').show()
print("Features:")
test_my_features.take(0).show()
Movie:
Title | Genre |
---|---|
new nightmare | thriller |
Features:
kill | mari | uh | well | dead | love | realli | great | yeah | gun |
---|---|---|---|---|---|---|---|---|---|
0.000729129 | 0 | 0 | 0.00401021 | 0.000364564 | 0.00109369 | 0.00401021 | 0.00109369 | 0.00109369 | 0 |
As before, we want to look for the movies in the training set that are most like our test movie. We will calculate the Euclidean distances from the test movie (using my_features) to all movies in the training set. You could do this with a for loop, but to make it computationally faster, we have provided a function, fast_distances, to do this for you. Read its documentation to make sure you understand what it does. (You don't need to understand the code in its body unless you want to.)
# Just run this cell to define fast_distances.
def fast_distances(test_row, train_table):
"""Return an array of the distances between test_row and each row in train_rows.
Takes 2 arguments:
test_row: A row of a table containing features of one
test movie (e.g., test_my_features.row(0)).
train_table: A table of features (for example, the whole
table train_my_features)."""
assert train_table.num_columns < 50, "Make sure you're not using all the features of the movies table."
counts_matrix = np.asmatrix(train_table.columns).transpose()
diff = np.tile(np.array(list(test_row)), [counts_matrix.shape[0], 1]) - counts_matrix
np.random.seed(0) # For tie breaking purposes
distances = np.squeeze(np.asarray(np.sqrt(np.square(diff).sum(1))))
eps = np.random.uniform(size=distances.shape)*1e-10 #Noise for tie break
distances = distances + eps
return distances
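For intuition, here is a loop-based sketch of what fast_distances computes. It is much slower and omits the tie-breaking noise, so it is for illustration only:
def slow_distances(test_row, train_table):
    """Loop-based equivalent of fast_distances (illustration only; no tie-breaking noise)."""
    test_features = np.array(list(test_row))
    dists = make_array()
    for i in np.arange(train_table.num_rows):
        row_features = np.array(list(train_table.row(i)))
        dists = np.append(dists, distance(test_features, row_features))
    return dists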
Use the fast_distances function provided above to compute the distance from the first movie in the test set to all the movies in the training set, using your set of features. Make a new table called genre_and_distances with one row for each movie in the training set and two columns:
- the "Genre" of the training movie
- the "Distance" from the first movie in the test set
Ensure that genre_and_distances is sorted in ascending order by distance to the first test movie.
# Your solution may take multiple lines of code.
genre_and_distances = (train_movies
    .with_column('Distance', fast_distances(test_my_features.row(0), train_my_features))
    .select('Genre', 'Distance')
    .sort('Distance'))
genre_and_distances
Genre | Distance |
---|---|
comedy | 0.00153825 |
comedy | 0.0016134 |
comedy | 0.00190262 |
comedy | 0.00192253 |
thriller | 0.0021041 |
comedy | 0.00215782 |
thriller | 0.00218006 |
comedy | 0.00242911 |
thriller | 0.00244957 |
thriller | 0.00245682 |
... (304 rows omitted)
grader.check("q3_1_8")
q3_1_8
passed!
Now compute the 7-nearest neighbors classification of the first movie in the test set. That is, decide on its genre by finding the most common genre among its 7 nearest neighbors in the training set, according to the distances you've calculated. Then check whether your classifier chose the right genre. (Depending on the features you chose, your classifier might not get this movie right, and that's okay.)
# Set my_assigned_genre to the most common genre among these.
closest_7 = genre_and_distances.take(np.arange(7))
my_assigned_genre = most_common('Genre', closest_7)
# Set my_assigned_genre_was_correct to True if my_assigned_genre
# matches the actual genre of the first movie in the test set.
my_assigned_genre_was_correct = my_assigned_genre == test_movies.column('Genre').item(0)
print("The assigned genre, {}, was{}correct.".format(my_assigned_genre, " " if my_assigned_genre_was_correct else " not "))
The assigned genre, comedy, was not correct.
grader.check("q3_1_9")
q3_1_9
passed!
Now we can write a single function that encapsulates the whole process of classification.
Write a function called classify. It should take the following four arguments:
1. A row of features for the movie to classify (e.g., test_my_features.row(0)).
2. A table of features for the training movies, one row per movie (e.g., train_my_features).
3. An array of labels for the training movies, one per row of the feature table.
4. k, the number of neighbors to use in classification.
It should return the class a k-nearest neighbor classifier picks for the given row of features (the string 'comedy' or the string 'thriller').
def classify(test_row, train_rows, train_labels, k):
    """Return the most common class among the k nearest neighbors to test_row."""
    distances = fast_distances(test_row, train_rows)
    genre_and_distances = Table().with_columns('Genre', train_labels, 'Distance', distances).sort('Distance')
    return most_common('Genre', genre_and_distances.take(np.arange(k)))
grader.check("q3_2_1")
q3_2_1
passed!
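For example, this call repeats the 7-nearest-neighbor classification of the first test movie from question 3.1.9, using the features defined above:
classify(test_my_features.row(0), train_my_features, train_movies.column('Genre'), 7)
# 'comedy', matching my_assigned_genre above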
Assign tron_genre to the genre predicted by your classifier for the movie "tron" in the test set, using 13 neighbors and using your chosen features.
# Define a row called tron_features.
tron_features = test_movies.where('Title', are.equal_to('tron')).select(my_features).row(0)
tron_genre = classify(tron_features, train_my_features, train_movies.column('Genre'), 13)
tron_genre
'comedy'
grader.check("q3_2_2")
q3_2_2
passed!
Finally, when we evaluate our classifier, it will be useful to have a classification function that is specialized to use a fixed training set and a fixed value of k.
Create a classification function that takes as its argument a row containing your chosen features and classifies that row using the 13-nearest neighbors algorithm with train_my_features as its training set.
def classify_feature_row(row):
return classify(row, train_my_features, train_movies.column('Genre'), 13)
# When you're done, this should produce 'thriller' or 'comedy'.
classify_feature_row(test_my_features.row(0))
'thriller'
grader.check("q3_2_3")
q3_2_3
passed!
Now that it's easy to use the classifier, let's see how accurate it is on the whole test set.
Question 3.3.1. Use classify_feature_row and apply to classify every movie in the test set. Assign these guesses as an array to test_guesses. Then, compute the proportion of correct classifications.
test_guesses = test_my_features.apply(classify_feature_row)
proportion_correct = np.sum(test_movies.column('Genre') == test_guesses) / test_movies.num_rows
proportion_correct
0.75
grader.check("q3_3_1")
q3_3_1
passed!
Question 3.3.2. An important part of evaluating your classifiers is figuring out where they make mistakes. Assign the name test_movie_correctness to a table with three columns, 'Title', 'Genre', and 'Was correct'. The last column should contain True or False depending on whether or not the movie was classified correctly.
# Feel free to use multiple lines of code
# but make sure to assign test_movie_correctness to the proper table!
test_movie_correctness = test_movies.select('Title', 'Genre').with_column(
    'Was correct', test_movies.column('Genre') == test_guesses)
test_movie_correctness
test_movie_correctness.sort('Was correct', descending = True).show(56)
Title | Genre | Was correct |
---|---|---|
new nightmare | thriller | True |
the body snatcher | thriller | True |
godzilla | thriller | True |
rear window | thriller | True |
u turn | thriller | True |
jason goes to hell: the final friday | thriller | True |
the crow: salvation | thriller | True |
ed wood | comedy | True |
storytelling | comedy | True |
halloween h20: 20 years later | thriller | True |
gone in sixty seconds | thriller | True |
the butterfly effect | thriller | True |
edtv | comedy | True |
black rain | thriller | True |
bringing out the dead | thriller | True |
basic | thriller | True |
detroit rock city | comedy | True |
panic room | thriller | True |
juno | comedy | True |
my girl 2 | comedy | True |
nick of time | thriller | True |
airplane ii: the sequel | comedy | True |
suburbia | comedy | True |
body of evidence | thriller | True |
twelve monkeys | thriller | True |
the game | thriller | True |
sleepy hollow | thriller | True |
his girl friday | comedy | True |
mulholland dr. | thriller | True |
spare me | thriller | True |
annie hall | comedy | True |
jackie brown | thriller | True |
monty python and the holy grail | comedy | True |
star trek: the wrath of khan | thriller | True |
batman returns | thriller | True |
suspect zero | thriller | True |
sphere | thriller | True |
o brother where art thou? | comedy | True |
what lies beneath | thriller | True |
wonder boys | comedy | True |
the war of the worlds | thriller | True |
what women want | comedy | True |
the grifters | thriller | False |
smoke | comedy | False |
mystery of the wax museum | thriller | False |
fast times at ridgemont high | comedy | False |
the fifth element | thriller | False |
hannibal | thriller | False |
misery | thriller | False |
smokey and the bandit | comedy | False |
backdraft | thriller | False |
tron | thriller | False |
happy birthday wanda june | comedy | False |
cruel intentions | thriller | False |
the thin man | comedy | False |
three kings | comedy | False |
grader.check("q3_3_2")
q3_3_2
passed!
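One way to look for patterns (a sketch using the table above) is to group the misclassified movies by genre:
test_movie_correctness.where('Was correct', False).group('Genre')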
Question 3.3.3. Do you see a pattern in the types of movies your classifier misclassifies? In two sentences or less, describe any patterns you see in the results or any other interesting findings from the table above. If you need some help, try looking up the movies that your classifier got wrong on Wikipedia.
The classifier tends to associate violent words with thrillers, which is why movies such as 'Happy Birthday Wanda June' were misclassified as thrillers.
At this point, you've gone through one cycle of classifier design. Let's summarize the steps:
1. Split the available data into a training set and a test set.
2. Choose a classification algorithm (here, k-nearest neighbors).
3. Identify some features.
4. Define a classifier function using your features and the training set.
5. Evaluate its performance (the proportion of correct classifications) on the test set.
Now that you know how to evaluate a classifier, it's time to build a better one.
Develop a classifier with better test-set accuracy than classify_feature_row. Your new function should have the same arguments as classify_feature_row and return a classification. Name it another_classifier. Then, check your accuracy using code from earlier.
You can use more or different features, or you can try different values of k. (Of course, you still have to use train_movies as your training set!)
Make sure you don't reassign any previously used variables here, such as proportion_correct from the previous question.
# To start you off, here's a list of possibly-useful features
# Feel free to add or change this array to improve your classifier
new_features = make_array("laugh", "marri", "dead", "heart", "cop", 'kill', 'uh', 'well', 'love', 'realli', 'great', 'yeah', 'gun')
train_new = train_movies.select(new_features)
test_new = test_movies.select(new_features)
def another_classifier(row):
return classify(row, train_new, train_movies.column('Genre'), 20)
new_guesses = test_new.apply(another_classifier)
new_correct = np.sum(test_movies.column('Genre') == new_guesses) / test_movies.num_rows
new_correct
0.7857142857142857
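To choose k, one can sweep a few values and compare test accuracy (a sketch; note that repeatedly tuning against the test set risks overfitting to it):
for k in make_array(5, 9, 13, 17, 21):
    guesses = test_new.apply(lambda row: classify(row, train_new, train_movies.column('Genre'), k))
    print(k, np.mean(test_movies.column('Genre') == guesses))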
Question 4.2
Do you see a pattern in the mistakes your new classifier makes? What about in the improvement from your first classifier to the second one? Describe in two sentences or less.
Hint: You may not be able to see a pattern.
Two more movies were classified correctly, but movies with themes that aren't typical for their genre, such as 'Happy Birthday Wanda June', were still misclassified.
Question 4.3
Briefly describe what you tried to improve your classifier.
We added more words to the features array and increased the number of nearest neighbors.