import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression


df = pd.read_excel("https://github.com/HuntsW/DataScienceClass/blob/main/social-ads-raw.xlsx?raw=true")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   segment    200 non-null    int64  
 1   google     200 non-null    float64
 2   facebook   200 non-null    float64
 3   instagram  200 non-null    float64
 4   sales      200 non-null    float64
 5   size       200 non-null    object 
 6   area       200 non-null    object 
dtypes: float64(4), int64(1), object(2)
memory usage: 11.1+ KB


fig, subs = plt.subplots(1, 3, figsize=(15,8))

sns.regplot(data=df, x='google', y='sales', ax=subs[0])
sns.regplot(data=df, x='facebook', y='sales', ax=subs[1])
sns.regplot(data=df, x='instagram', y='sales', ax=subs[2])

<matplotlib.axes._subplots.AxesSubplot at 0x7fb6ec7cd810>


columns = ['google', 'facebook', 'instagram']
X = df[columns]
y = df['sales']
model = LinearRegression()
model.fit(X,y)

LinearRegression()


# display the R square, to measure the accuracy of the LR models
model.score(X,y)

0.8972106381789522


# calculate intercept & coefficient for each variables google, FB and IG, IG shows negative correlationship with sales
# if we did not spend on the ads, intercept give us the value of sales without ads
# looking at the coefficient value for google and FB, FB provides stronger +tive coefficient relationship with sales, allocate 50K, then google 40K
print("Intercept :", model.intercept_)
print("Coefficeint :", model.coef_)

Intercept : 2.938889369459412
Coefficeint : [ 0.04576465  0.18853002 -0.00103749]


# using the intercept value we can predict the sales gained if we did not spend a single dollar on the ads
# assume the unit price for our product is 5000
# y = mx +c
# Google

unit_price = 5000
predicted_sales = 0
predicted_sales = (0.04576465 + 2.938889369459412 * 1000) * unit_price
print(f"Google : {predicted_sales:,.2f}")

unit_price = 5000
predicted_sales = 0
predicted_sales = (0.18853002  + 2.938889369459412 * 1000) * unit_price
print(f"Facebook : {predicted_sales:,.2f}")

unit_price = 5000
predicted_sales = 0
predicted_sales = (-0.00103749 + 2.938889369459412 * 1000) * unit_price
print(f"Instagram : {predicted_sales:,.2f}")

Google : 14,694,675.67
Facebook : 14,695,389.50
Instagram : 14,694,441.66


# convert the size column to binary data type that contains 0 and 1 value only with map function
# display the df
df['size'] = df['size'].map({"small":0, "large":1})
df


# Convert the area to numeric value 0 and 1 only for each type using dummies()
# display the df
df = pd.get_dummies(data=df, columns=['area'])
df


columns = ['google', 'facebook', 'instagram', 'size', 'area_rural', 'area_suburban', 'area_urban']
X = df[columns]
y = df['sales']

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X,y)

LinearRegression()


# R square calculation, to measure the accuracy of the model
model.score(X,y)

0.8981611875394304


# calculate the intercept and coefficient
print("Intercept :", model.intercept_)
print("Coefficient :", model.coef_)

Intercept : 2.9280493329187873
Coefficient : [ 0.04574401  0.1878667  -0.0010877   0.07739661 -0.05385834 -0.16042133
  0.21427968]


# Marketing budget of 100K for small and urban area, 50K on google, 40K on FB and 10K on IG

prediction = model.predict([[50, 40, 10, 0, 0, 0, 1]])
print(f"Model Prediction : {prediction}")
unit_price = 5000
expected_revenue = 12.93332037 * 1000 * unit_price
print(f"Expected Revenue : {expected_revenue:,.2f}")

Model Prediction : [12.93332037]
Expected Revenue : 64,666,601.85

/usr/local/lib/python3.7/dist-packages/sklearn/base.py:451: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  "X does not have valid feature names, but"


# Marketing budget of 100K for small and rural area, 50K on google, 40K on FB and 10K on IG
prediction = model.predict([[50, 40, 10, 0, 1, 0, 0]])
print(f"Model Prediction : {prediction}")
unit_price = 5000
expected_revenue = 12.66518235 * 1000 * unit_price
print(f"Expected Revenue : {expected_revenue:,.2f}")

Model Prediction : [12.66518235]
Expected Revenue : 63,325,911.75

/usr/local/lib/python3.7/dist-packages/sklearn/base.py:451: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  "X does not have valid feature names, but"


# Marketing budget of 100K for large and urban area, 50K on google, 40K on FB and 10K on IG
prediction = model.predict([[50, 40, 10, 1, 0, 0, 1]])
print(f"Model Prediction : {prediction}")
unit_price = 5000
expected_revenue = 13.01071698 * 1000 * unit_price
print(f"Expected Revenue : {expected_revenue:,.2f}")

Model Prediction : [13.01071698]
Expected Revenue : 65,053,584.90

/usr/local/lib/python3.7/dist-packages/sklearn/base.py:451: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  "X does not have valid feature names, but"


# Marketing budget of 100K for large and suburban area, 50K on google, 40K on FB and 10K on IG
prediction = model.predict([[50, 40, 10, 1, 0, 1, 0]])
print(f"Model Prediction : {prediction}")
unit_price = 5000
expected_revenue = 12.63601597 * 1000 * unit_price
print(f"Expected Revenue : {expected_revenue:,.2f}")

Model Prediction : [12.63601597]
Expected Revenue : 63,180,079.85

/usr/local/lib/python3.7/dist-packages/sklearn/base.py:451: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  "X does not have valid feature names, but"


# Marketing budget of 100K for large and rural area, 50K on google, 40K on FB and 10K on IG
prediction = model.predict([[50, 40, 10, 1, 1, 0, 0]])
print(f"Model Prediction : {prediction}")
unit_price = 5000
expected_revenue = 12.74257896 * 1000 * unit_price
print(f"Expected Revenue : {expected_revenue:,.2f}")

Model Prediction : [12.74257896]
Expected Revenue : 63,712,894.80

/usr/local/lib/python3.7/dist-packages/sklearn/base.py:451: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  "X does not have valid feature names, but"


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   segment        200 non-null    int64  
 1   google         200 non-null    float64
 2   facebook       200 non-null    float64
 3   instagram      200 non-null    float64
 4   sales          200 non-null    float64
 5   size           200 non-null    int64  
 6   area_rural     200 non-null    uint8  
 7   area_suburban  200 non-null    uint8  
 8   area_urban     200 non-null    uint8  
dtypes: float64(4), int64(2), uint8(3)
memory usage: 10.1 KB

	segment	google	facebook	instagram	sales	size	area
0	1	230.1	37.8	69.2	22.1	large	rural
1	2	44.5	39.3	45.1	10.4	small	urban
2	3	17.2	45.9	69.3	9.3	small	rural
3	4	151.5	41.3	58.5	18.5	small	urban
4	5	180.8	10.8	58.4	12.9	large	suburban

	segment	google	facebook	instagram	sales	size	area
0	1	230.1	37.8	69.2	22.1	1	rural
1	2	44.5	39.3	45.1	10.4	0	urban
2	3	17.2	45.9	69.3	9.3	0	rural
3	4	151.5	41.3	58.5	18.5	0	urban
4	5	180.8	10.8	58.4	12.9	1	suburban
...	...	...	...	...	...	...	...
195	196	38.2	3.7	13.8	7.6	0	suburban
196	197	94.2	4.9	8.1	9.7	0	urban
197	198	177.0	9.3	6.4	12.8	0	suburban
198	199	283.6	42.0	66.2	25.5	0	rural
199	200	232.1	8.6	8.7	13.4	1	rural

	segment	google	facebook	instagram	sales	size	area_rural	area_suburban	area_urban
0	1	230.1	37.8	69.2	22.1	1	1	0	0
1	2	44.5	39.3	45.1	10.4	0	0	0	1
2	3	17.2	45.9	69.3	9.3	0	1	0	0
3	4	151.5	41.3	58.5	18.5	0	0	0	1
4	5	180.8	10.8	58.4	12.9	1	0	1	0
...	...	...	...	...	...	...	...	...	...
195	196	38.2	3.7	13.8	7.6	0	0	1	0
196	197	94.2	4.9	8.1	9.7	0	0	0	1
197	198	177.0	9.3	6.4	12.8	0	0	1	0
198	199	283.6	42.0	66.2	25.5	0	1	0	0
199	200	232.1	8.6	8.7	13.4	1	1	0	0

Data Understanding¶

Modelling¶