import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier, plot_importance
from sklearn.linear_model import LogisticRegression
from scipy import stats
from sklearn.feature_selection import RFECV
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score, classification_report
from numpy import sort
from sklearn.feature_selection import SelectFromModel

# Read the training data into a DataFrame
dt = pd.read_csv(filepath_or_buffer='train.csv')

# Count rows that exactly repeat an earlier row
dt.duplicated(keep='first').sum()

np.int64(0)

# Count the missing values in every column
dt.isna().sum()

title                   2
country                 0
genres                  0
language                5
writer_count           94
title_adaption          0
censor_rating          38
release_date            4
runtime                 0
dvd_release_date       69
users_votes             0
comments              426
likes                 444
overall_views         317
dislikes              444
ratings_imdb            0
ratings_tomatoes        0
ratings_metacritic      0
special_award           0
awards_win              0
awards_nomination       0
revenue_category        0
dtype: int64

# Box plots of the four engagement metrics, laid out on a 2x2 grid, to eyeball outliers
outlier_panels = [
    ('comments', 'Comments'),
    ('likes', 'Likes'),
    ('overall_views', 'Overall Views'),
    ('dislikes', 'Dislikes'),
]
for panel_number, (metric, heading) in enumerate(outlier_panels, start=1):
    plt.subplot(2, 2, panel_number)
    sns.boxplot(data=dt[metric])
    plt.title(heading)

# Keep titles and tick labels of neighbouring panels from overlapping
plt.tight_layout()

# Fill missing values in place: the median for the engagement counts,
# the most frequent value for the remaining columns that have gaps.
columns = ['comments', 'likes', 'overall_views', 'dislikes']
columns1 = ['writer_count', 'censor_rating', 'dvd_release_date', 'language','release_date','title']

fill_plan = [
    (columns, lambda series: series.median()),
    (columns1, lambda series: series.mode()[0]),  # mode() may return ties; take the first
]
for column_group, pick_fill_value in fill_plan:
    for column in column_group:
        fill_value = pick_fill_value(dt[column])
        dt.loc[dt[column].isnull(), column] = fill_value

# Re-count missing values to confirm the imputation left none behind
dt.isna().sum()

title                 0
country               0
genres                0
language              0
writer_count          0
title_adaption        0
censor_rating         0
release_date          0
runtime               0
dvd_release_date      0
users_votes           0
comments              0
likes                 0
overall_views         0
dislikes              0
ratings_imdb          0
ratings_tomatoes      0
ratings_metacritic    0
special_award         0
awards_win            0
awards_nomination     0
revenue_category      0
dtype: int64

# Parse the two date columns and derive year, month and weekday-name features
import warnings

# record=True captures any warnings raised inside the block instead of showing them
with warnings.catch_warnings(record=True):
    release_ts = pd.to_datetime(dt['release_date'])
    dvd_ts = pd.to_datetime(dt['dvd_release_date'])
    dt = dt.assign(
        r_date=release_ts,
        dvd_r_date=dvd_ts,
        r_year=release_ts.dt.year,
        r_month=release_ts.dt.month,
        r_day=release_ts.dt.day_name(),
        dvd_r_year=dvd_ts.dt.year,
        dvd_r_month=dvd_ts.dt.month,
        dvd_r_day=dvd_ts.dt.day_name(),
    )

# Remove commas in users_votes and convert to numeric
# Normalise censor_rating: upper-case, with 'UNRATED' folded into 'NOT RATED'
# New features: number of comma-separated entries in country, genres and language
# Strip the '%', '/100', '/10' and 'min' suffixes and cast ratings/runtime to numeric types
votes_numeric = dt['users_votes'].str.replace(",", "").astype("int64")
rating_label = dt['censor_rating'].str.upper().str.replace('UNRATED', 'NOT RATED')
tomatoes_numeric = dt['ratings_tomatoes'].str.replace("%", "").astype("int64")
metacritic_numeric = dt['ratings_metacritic'].str.replace("/100", "").astype("int64")
imdb_numeric = dt['ratings_imdb'].str.replace("/10", "").astype("float64")
runtime_numeric = dt['runtime'].str.replace("min", "").astype("int64")

dt = dt.assign(
    users_votes=votes_numeric,
    censor_rating=rating_label,
    country_no=dt['country'].str.split(',').str.len(),
    genre_no=dt['genres'].str.split(',').str.len(),
    lang_no=dt['language'].str.split(',').str.len(),
    ratings_tomatoes=tomatoes_numeric,
    ratings_metacritic=metacritic_numeric,
    ratings_imdb=imdb_numeric,
    runtime=runtime_numeric,
)

# Descriptive statistics of the non-object, non-boolean columns, one row per column
# NOTE(review): only the first 10 rows are summarised here; the bin edges used for
# the categorical features further down do not match these 10-row quartiles, so
# confirm whether the full frame was meant to be described.
dt.head(n=10).describe(exclude=["object", "bool"]).transpose()

# Based on descriptives above, let's generate categorical variables from our numeric variables.
def categorize_column(df, column_name, bins, labels, *, right=True, include_lowest=False):
    """Bin a numeric column and store the result as ``<column_name>_cat``.

    The new column is written into ``df`` in place; nothing is returned.

    Args:
        df: DataFrame holding ``column_name``; it is modified in place.
        column_name: Name of the numeric column to bin.
        bins: Bin edges passed to ``pd.cut``.
        labels: One label per bin (``len(bins) - 1`` entries).
        right: Passed to ``pd.cut``; when True each bin includes its right edge.
        include_lowest: Passed to ``pd.cut``; when True the first bin also
            includes its left edge.

    Note:
        With the defaults, a value equal to the first edge (and any value
        outside the outermost edges) gets no category and becomes NaN.
        Pass ``include_lowest=True`` to keep values sitting on the first edge.
    """
    df[f'{column_name}_cat'] = pd.cut(
        df[column_name],
        bins=bins,
        labels=labels,
        right=right,
        include_lowest=include_lowest,
    )

# Bin each numeric feature into four labelled ranges; every entry is
# (source column, bin edges, labels) and produces a '<source column>_cat' column.
# NOTE(review): values at or below the first edge, or above the last one, fall
# outside every bin and end up without a category - confirm that is acceptable.
binning_plan = [
    # User votes
    ('users_votes', [1, 7000, 29000, 94000, 2100000],
     ['< 7,000', '7,000 -29,000', '29,000 - 94,000', '> 94,000']),
    # Runtime
    ('runtime', [1, 93, 102, 115, 566],
     ['< 93', '93 -102', '102-115', '> 115']),
    # Comments
    ('comments', [1, 10, 57, 289, 44644],
     ['< 10', '10 - 57', '57 - 289', '> 289']),
    # Likes
    ('likes', [1, 99, 425, 1819, 188526],
     ['< 99', '99 - 425', '425 - 1819', '> 1819']),
    # Overall views
    ('overall_views', [1, 67529.5, 281652, 985509.5, 107150221],
     ['< 67529.5', '67529.5- 281652', '281652 - 985509.5', '> 985509.5']),
    # Dislikes
    ('dislikes', [1, 7, 30, 121.25, 29267],
     ['< 7', '7-30', '30-122', '>122']),
    # IMDb rating
    ('ratings_imdb', [1, 5.9, 6.6, 7.2, 9.0],
     ['< 5.9', '5.9-6.6', '6.6-7.2', '>7.2']),
    # Rotten Tomatoes rating
    ('ratings_tomatoes', [1, 34.0, 62.0, 82.0, 100.0],
     ['< 34', '34-62', '62-82', '>82']),
    # Metacritic rating
    ('ratings_metacritic', [1, 41.0, 56.0, 69.0, 100.0],
     ['< 41', '41-56', '56-69', '>69']),
    # Award nominations
    ('awards_nomination', [0, 1.0, 5.0, 12.0, 326.0],
     ['< 1', '1-5', '5-12', '>12']),
    # Release year
    ('r_year', [1970, 2005, 2009, 2012, 2072],
     ['< 2005', '2005-2009', '2009-2012', '>2012']),
]
for source_column, edges, range_labels in binning_plan:
    categorize_column(dt, source_column, edges, range_labels)

# Bar chart of how many rows fall in each revenue category, with the count printed on each bar
plt.figure(figsize=(12, 8))
ax = sns.countplot(data=dt, x='revenue_category')
ax.bar_label(ax.containers[0])
ax.set_title('Distribution of the dependent variable')
plt.show()

# For each categorical feature, plot the share of every revenue category
# within each feature level (rows of the crosstab are normalised to sum to 1).
# Each entry is (feature column, plot title).
revenue_breakdowns = [
    ('censor_rating', 'Censor Rating Against Revenue Category'),
    ('r_year_cat', 'Release Year Against Revenue Category'),
    ('users_votes_cat', 'User Votes against Revenue Category'),
    ('runtime_cat', 'Runtime against Revenue Category'),
    ('comments_cat', 'Comments against Revenue Category'),
    ('likes_cat', 'Likes against Revenue Category'),
    ('overall_views_cat', 'Overall Views against Revenue Category'),
    ('dislikes_cat', 'Dislikes against Revenue Category'),
    ('ratings_imdb_cat', 'Ratings Imdb Against Revenue Category'),
    ('ratings_tomatoes_cat', 'Ratings Tomatoes against Revenue Category'),
    ('ratings_metacritic_cat', 'Ratings Metacritic against Revenue Category'),
    ('awards_nomination_cat', 'Awards nomination against Revenue Category'),
]
for feature, heading in revenue_breakdowns:
    shares = pd.crosstab(dt[feature], dt['revenue_category'], normalize='index')
    # Passing the title to the plot call ties it to the axes this call creates
    # rather than to whichever axes happens to be current.
    shares.plot.bar(stacked=False, title=heading)

# Correlation analysis of the numeric columns, drawn as an annotated heatmap
dt2 = dt.select_dtypes(include=np.number)
corr = dt2.corr()

# Hide the upper triangle (diagonal included) so each pair is shown only once
mask = np.triu(np.ones_like(corr, dtype=bool))

heatmap_options = dict(
    mask=mask,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .75},
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)
plt.figure(figsize=(20, 15))
sns.heatmap(corr, **heatmap_options)

<Axes: >

# Remove variables that are not needed for modelling
unneeded_columns = [
    'title',
    'ratings_imdb', 'ratings_tomatoes', 'ratings_metacritic',
    'release_date', 'dvd_release_date',
    'awards_nomination',
    'r_date', 'dvd_r_date',
    'users_votes', 'comments', 'likes', 'dislikes', 'overall_views',
    'runtime', 'r_year',
]
dt = dt.drop(columns=unneeded_columns)

# Setting the preprocessing instance
le = preprocessing.LabelEncoder()

# Replace each categorical column (and the target) with integer codes.
# The same encoder object is re-fitted for every column, so after the loop it
# only remembers the classes of the last column processed.
columns_to_encode = [
    'country', 'genres', 'language', 'censor_rating', 'revenue_category',
    'r_day', 'dvd_r_day',
    'users_votes_cat', 'awards_nomination_cat', 'r_year_cat',
    'ratings_metacritic_cat', 'ratings_tomatoes_cat', 'ratings_imdb_cat',
    'dislikes_cat', 'overall_views_cat', 'likes_cat', 'comments_cat',
    'runtime_cat', 'title_adaption',
]
for encoded_column in columns_to_encode:
    dt[encoded_column] = le.fit_transform(dt[encoded_column])

# Separate the target (y) from the predictor columns (X)
y = dt['revenue_category']
X = dt.drop(columns=["revenue_category"])

# Hold out 30% of the rows for evaluation, then standardise both parts using
# statistics learned from the training part only
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)
sc = StandardScaler().fit(X_train)
X_train_s = sc.transform(X_train)
X_test_s = sc.transform(X_test)

# Feature selection with XGBoost: fit once on every feature, then retrain on
# progressively smaller feature subsets and report the hold-out accuracy of each
xgb_f = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=101)
rfecv = xgb_f.fit(X_train, y_train)
y_pred = rfecv.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Every distinct importance value is tried as a cut-off, smallest first
thresholds = np.sort(rfecv.feature_importances_)

import warnings

with warnings.catch_warnings(record=True):
    for thresh in thresholds:
        # Keep only the features whose importance reaches this threshold
        selection = SelectFromModel(rfecv, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)
        select_X_test = selection.transform(X_test)

        # Retrain on the reduced feature set
        selection_model = XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=101,
        )
        selection_model.fit(select_X_train, y_train)

        # Score the reduced model on the hold-out rows
        predictions = selection_model.predict(select_X_test)
        accuracy = accuracy_score(y_test, predictions)
        n_kept = select_X_train.shape[1]
        print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, n_kept, accuracy*100.0))

Accuracy: 0.8653
Thresh=0.000, n=27, Accuracy: 86.53%
Thresh=0.010, n=26, Accuracy: 86.53%
Thresh=0.012, n=25, Accuracy: 87.08%
Thresh=0.014, n=24, Accuracy: 86.25%
Thresh=0.014, n=23, Accuracy: 86.53%
Thresh=0.015, n=22, Accuracy: 86.94%
Thresh=0.015, n=21, Accuracy: 86.81%
Thresh=0.016, n=20, Accuracy: 86.25%
Thresh=0.016, n=19, Accuracy: 87.08%
Thresh=0.018, n=18, Accuracy: 85.83%
Thresh=0.018, n=17, Accuracy: 85.83%
Thresh=0.018, n=16, Accuracy: 85.69%
Thresh=0.018, n=15, Accuracy: 85.97%
Thresh=0.020, n=14, Accuracy: 85.69%
Thresh=0.020, n=13, Accuracy: 85.69%
Thresh=0.022, n=12, Accuracy: 85.97%
Thresh=0.025, n=11, Accuracy: 85.83%
Thresh=0.026, n=10, Accuracy: 86.11%
Thresh=0.029, n=9, Accuracy: 85.69%
Thresh=0.031, n=8, Accuracy: 85.42%
Thresh=0.031, n=7, Accuracy: 85.83%
Thresh=0.034, n=6, Accuracy: 85.28%
Thresh=0.039, n=5, Accuracy: 83.33%
Thresh=0.042, n=4, Accuracy: 83.33%
Thresh=0.052, n=3, Accuracy: 84.17%
Thresh=0.056, n=2, Accuracy: 82.78%
Thresh=0.390, n=1, Accuracy: 77.50%

# Plot the feature importances of the fitted XGBoost model
plot_importance(booster=rfecv)

<Axes: title={'center': 'Feature importance'}, xlabel='F score', ylabel='Features'>

# Rank the features by XGBoost importance and keep the names of the 19 strongest
feature_importances = rfecv.feature_importances_
importance_table = pd.DataFrame(
    {'feature': X_train.columns, 'importance': feature_importances}
)
features_df = importance_table.sort_values(by='importance', ascending=False)

top_19_features = features_df['feature'].head(19).tolist()
print(top_19_features)

['users_votes_cat', 'censor_rating', 'country', 'dvd_r_year', 'language', 'r_day', 'ratings_imdb_cat', 'ratings_tomatoes_cat', 'awards_win', 'ratings_metacritic_cat', 'writer_count', 'country_no', 'awards_nomination_cat', 'special_award', 'genres', 'overall_views_cat', 'dislikes_cat', 'r_month', 'dvd_r_month']

# Restrict the train and hold-out frames to the 19 selected features
X_train_2 = X_train.loc[:, top_19_features]
X_test_2 = X_test.loc[:, top_19_features]

# Random forest with 64 entropy-criterion trees, trained on the selected features
rf_classifier = RandomForestClassifier(
    n_estimators=64,
    criterion='entropy',
    random_state=101,
)
rf_model = rf_classifier.fit(X_train_2, y_train)
rf_pred = rf_model.predict(X_test_2)

# Per-class precision / recall / F1 on the hold-out rows
rf_report = classification_report(y_test, rf_pred)
print(rf_report)

              precision    recall  f1-score   support

           0       0.89      0.88      0.88       384
           1       0.86      0.87      0.87       336

    accuracy                           0.88       720
   macro avg       0.87      0.87      0.87       720
weighted avg       0.88      0.88      0.88       720

# XGBoost classifier with 64 trees, trained on the selected features
xgb_classifier = XGBClassifier(
    n_estimators=64,
    max_depth=6,
    learning_rate=0.1,
    random_state=101,
)
xgb_model = xgb_classifier.fit(X_train_2, y_train)
xgb_pred = xgb_model.predict(X_test_2)

# Per-class precision / recall / F1 on the hold-out rows
xgb_report = classification_report(y_test, xgb_pred)
print(xgb_report)

              precision    recall  f1-score   support

           0       0.88      0.86      0.87       384
           1       0.84      0.87      0.86       336

    accuracy                           0.86       720
   macro avg       0.86      0.86      0.86       720
weighted avg       0.86      0.86      0.86       720

# Fitting Logistic Regression Model
# The features are standardised inside a pipeline because the solver is
# sensitive to feature scale; the scaler is learned from the training rows only
# and re-applied automatically whenever the fitted model predicts.
# Warnings are no longer suppressed, so a convergence problem stays visible.
from sklearn.pipeline import make_pipeline

logistic_regressor = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),  # extra iterations to let the solver converge
)
logmodel = logistic_regressor.fit(X_train_2, y_train)
log_pred = logmodel.predict(X_test_2)
print(classification_report(y_test, log_pred))

              precision    recall  f1-score   support

           0       0.76      0.72      0.74       384
           1       0.70      0.74      0.72       336

    accuracy                           0.73       720
   macro avg       0.73      0.73      0.73       720
weighted avg       0.73      0.73      0.73       720

Implementing the Model on Test Data Set

# Read the unlabeled test data into a DataFrame
dt3 = pd.read_csv(filepath_or_buffer='test.csv')

# Count rows that exactly repeat an earlier row
dt3.duplicated(keep='first').sum()

np.int64(0)

# Count the missing values in every column of the test data
dt3.isna().sum()

title                   0
country                 0
genres                  0
language                0
writer_count           29
title_adaption          0
censor_rating          16
release_date            2
runtime                 0
dvd_release_date       18
users_votes             0
comments              124
likes                 123
overall_views          93
dislikes              123
ratings_imdb            0
ratings_tomatoes        0
ratings_metacritic      0
special_award           0
awards_win              0
awards_nomination       0
dtype: int64

# Fill missing values in place, using statistics computed from dt3 itself:
# the median for the engagement counts, the most frequent value for the rest.
columns = ['comments', 'likes', 'overall_views', 'dislikes']
columns1 = ['writer_count', 'censor_rating', 'dvd_release_date', 'language','release_date']

fill_plan = [
    (columns, lambda series: series.median()),
    (columns1, lambda series: series.mode()[0]),  # mode() may return ties; take the first
]
for column_group, pick_fill_value in fill_plan:
    for column in column_group:
        fill_value = pick_fill_value(dt3[column])
        dt3.loc[dt3[column].isnull(), column] = fill_value

import warnings

# Parse the two date columns and derive year, month and weekday-name features.
# record=True captures any warnings raised inside the block instead of showing them.
with warnings.catch_warnings(record=True):
    release_ts = pd.to_datetime(dt3['release_date'])
    dvd_ts = pd.to_datetime(dt3['dvd_release_date'])
    dt3 = dt3.assign(
        r_date=release_ts,
        dvd_r_date=dvd_ts,
        r_year=release_ts.dt.year,
        r_month=release_ts.dt.month,
        r_day=release_ts.dt.day_name(),
        dvd_r_year=dvd_ts.dt.year,
        dvd_r_month=dvd_ts.dt.month,
        dvd_r_day=dvd_ts.dt.day_name(),
    )

import warnings

with warnings.catch_warnings(record=True):
    # Remove commas in users_votes and convert to numeric
    # Normalise censor_rating: upper-case, with 'UNRATED' folded into 'NOT RATED'
    # New features: number of comma-separated entries in country, genres and language
    # Strip the '%', '/100', '/10' and 'min' suffixes and cast ratings/runtime to numeric types
    votes_numeric = dt3['users_votes'].str.replace(",", "").astype("int64")
    rating_label = dt3['censor_rating'].str.upper().str.replace('UNRATED', 'NOT RATED')
    tomatoes_numeric = dt3['ratings_tomatoes'].str.replace("%", "").astype("int64")
    metacritic_numeric = dt3['ratings_metacritic'].str.replace("/100", "").astype("int64")
    imdb_numeric = dt3['ratings_imdb'].str.replace("/10", "").astype("float64")
    runtime_numeric = dt3['runtime'].str.replace("min", "").astype("int64")

    dt3 = dt3.assign(
        users_votes=votes_numeric,
        censor_rating=rating_label,
        country_no=dt3['country'].str.split(',').str.len(),
        genre_no=dt3['genres'].str.split(',').str.len(),
        lang_no=dt3['language'].str.split(',').str.len(),
        ratings_tomatoes=tomatoes_numeric,
        ratings_metacritic=metacritic_numeric,
        ratings_imdb=imdb_numeric,
        runtime=runtime_numeric,
    )

# Binning numerical variables
def categorize_column(df, column_name, bins, labels, *, right=True, include_lowest=False):
    """Bin a numeric column and store the result as ``<column_name>_cat``.

    The new column is written into ``df`` in place; nothing is returned.

    Args:
        df: DataFrame holding ``column_name``; it is modified in place.
        column_name: Name of the numeric column to bin.
        bins: Bin edges passed to ``pd.cut``.
        labels: One label per bin (``len(bins) - 1`` entries).
        right: Passed to ``pd.cut``; when True each bin includes its right edge.
        include_lowest: Passed to ``pd.cut``; when True the first bin also
            includes its left edge.

    Note:
        With the defaults, a value equal to the first edge (and any value
        outside the outermost edges) gets no category and becomes NaN.
        Pass ``include_lowest=True`` to keep values sitting on the first edge.
    """
    df[f'{column_name}_cat'] = pd.cut(
        df[column_name],
        bins=bins,
        labels=labels,
        right=right,
        include_lowest=include_lowest,
    )

# Bin each numeric feature of the test data into four labelled ranges; every entry
# is (source column, bin edges, labels) and produces a '<source column>_cat' column.
# NOTE(review): values at or below the first edge, or above the last one, fall
# outside every bin and end up without a category - confirm that is acceptable.
binning_plan = [
    # User votes
    ('users_votes', [1, 7000, 29000, 94000, 2100000],
     ['< 7,000', '7,000 -29,000', '29,000 - 94,000', '> 94,000']),
    # Runtime
    ('runtime', [1, 93, 102, 115, 566],
     ['< 93', '93 -102', '102-115', '> 115']),
    # Comments
    ('comments', [1, 10, 57, 289, 44644],
     ['< 10', '10 - 57', '57 - 289', '> 289']),
    # Likes
    ('likes', [1, 99, 425, 1819, 188526],
     ['< 99', '99 - 425', '425 - 1819', '> 1819']),
    # Overall views
    ('overall_views', [1, 67529.5, 281652, 985509.5, 107150221],
     ['< 67529.5', '67529.5- 281652', '281652 - 985509.5', '> 985509.5']),
    # Dislikes
    ('dislikes', [1, 7, 30, 121.25, 29267],
     ['< 7', '7-30', '30-122', '>122']),
    # IMDb rating
    ('ratings_imdb', [1, 5.9, 6.6, 7.2, 9.0],
     ['< 5.9', '5.9-6.6', '6.6-7.2', '>7.2']),
    # Rotten Tomatoes rating
    ('ratings_tomatoes', [1, 34.0, 62.0, 82.0, 100.0],
     ['< 34', '34-62', '62-82', '>82']),
    # Metacritic rating
    ('ratings_metacritic', [1, 41.0, 56.0, 69.0, 100.0],
     ['< 41', '41-56', '56-69', '>69']),
    # Award nominations
    ('awards_nomination', [0, 1.0, 5.0, 12.0, 326.0],
     ['< 1', '1-5', '5-12', '>12']),
    # Release year
    ('r_year', [1970, 2005, 2009, 2012, 2072],
     ['< 2005', '2005-2009', '2009-2012', '>2012']),
]
for source_column, edges, range_labels in binning_plan:
    categorize_column(dt3, source_column, edges, range_labels)

# Setting the preprocessing instance
le = preprocessing.LabelEncoder()

# Replace each categorical column of the test data with integer codes.
# NOTE(review): the encoder is fitted on the dt3 values here, not reused from the
# training step, so a given category only receives the same code as in training
# if both frames contain the same set of values - verify before trusting predictions.
columns_to_encode = [
    'country', 'genres', 'language', 'censor_rating',
    'r_day', 'dvd_r_day',
    'users_votes_cat', 'awards_nomination_cat', 'r_year_cat',
    'ratings_metacritic_cat', 'ratings_tomatoes_cat', 'ratings_imdb_cat',
    'dislikes_cat', 'overall_views_cat', 'likes_cat', 'comments_cat',
    'runtime_cat', 'title_adaption',
]
for encoded_column in columns_to_encode:
    dt3[encoded_column] = le.fit_transform(dt3[encoded_column])

# Keep only the 19 features the model was trained on, in the same order
dt3_clean = dt3.loc[:, top_19_features]

# Final prediction on the test data with the fitted random forest
test_final = rf_model.predict(dt3_clean)

# Combine the predicted values with the title column
# (the frame is only built here; it is not written to a CSV file in this step).
# An explicit copy is taken so the new column is added to an independent frame
# rather than to a slice of dt3, which is what used to require silencing warnings.
final_submissions = dt3[['title']].copy()

# Translate the encoded classes back to their labels
Rev = {0:'High', 1: 'Low'}
final_submissions['revenue_category'] = pd.Series(
    test_final, index=final_submissions.index
).map(Rev)

# Preview the first 5 rows of the final prediction
final_submissions.head(n=5)

	count	mean	min	25%	50%	75%	max	std
writer_count	10.0	2.1	1.0	1.0	2.0	2.75	4.0	1.197219
runtime	10.0	105.7	68.0	91.5	95.0	115.75	160.0	27.137715
users_votes	10.0	99582.2	5498.0	16903.25	35826.5	91285.5	439998.0	143627.319725
comments	10.0	84.5	1.0	45.0	57.0	107.0	268.0	79.630047
likes	10.0	598.5	56.0	371.0	425.0	673.25	1646.0	490.771558
overall_views	10.0	381275.9	54156.0	281652.0	316387.5	499384.75	970306.0	260279.535264
dislikes	10.0	30.2	9.0	21.0	30.0	35.25	57.0	14.800901
ratings_imdb	10.0	6.63	5.1	6.125	6.65	7.05	8.1	0.895731
ratings_tomatoes	10.0	56.4	25.0	38.75	59.0	76.75	85.0	22.31193
ratings_metacritic	10.0	43.7	0.0	33.5	53.0	62.25	76.0	26.259601
special_award	10.0	0.1	0.0	0.0	0.0	0.0	1.0	0.316228
awards_win	10.0	6.1	0.0	0.25	3.0	9.75	22.0	7.978443
awards_nomination	10.0	18.3	0.0	1.5	4.5	13.75	92.0	30.528857
r_date	10	2006-10-07 02:24:00	2002-04-12 00:00:00	2004-01-16 00:00:00	2006-03-20 00:00:00	2009-05-01 00:00:00	2013-05-31 00:00:00	NaN
dvd_r_date	10	2007-03-20 14:24:00	2002-08-20 00:00:00	2004-05-02 06:00:00	2006-09-29 12:00:00	2009-09-08 00:00:00	2013-07-15 00:00:00	NaN
r_year	10.0	2006.3	2002.0	2003.25	2006.0	2008.75	2013.0	3.713339
r_month	10.0	6.0	1.0	4.0	6.0	8.75	10.0	3.265986
dvd_r_year	10.0	2006.8	2002.0	2004.25	2006.5	2008.75	2013.0	3.359894
dvd_r_month	10.0	5.4	1.0	2.0	5.5	7.75	12.0	3.893014
country_no	10.0	1.1	1.0	1.0	1.0	1.0	2.0	0.316228
genre_no	10.0	3.4	2.0	2.25	3.0	4.0	6.0	1.349897
lang_no	10.0	1.6	1.0	1.0	1.0	2.5	3.0	0.966092

Introduction

Importing the required packages

Feature Engineering

Conclusion

	title	revenue_category
0	Delhi-6	Low
1	Before I Disappear	Low
2	Good Year, A	High
3	Brüno	High
4	How to Lose a Guy in 10 Days	High