Web Scraping means extracting information from websites by parsing the HTML of the webpage.
website used for scraping : https://store.hp.com/in-en/default/laptops-tablets.html
Libraries used:
-
requests
Requests gets the web page for you -
BeautifulSoup
you need to parse the HTML from the page to retrieve the data. That is done by BeautifulSoup. -
re
regular expressions module used to make manipulations on data we got. -
pandas
To create the dataframe from the data we got by scraping and creating
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
Each Laptop product is rendered in the webpage as follow
prod_list=[]
def getData(products):
'''
Takes a List of products and scrapes the required specifications of each product and
append them to the prod_list defined above.
'''
for product in products:
#name
name=product.find('a',{"class":"product-item-link"}).text.strip()
#rating of product is in div tag with attribute data-by-average-overall-rating
rating=product.find('div',attrs={'data-bv-average-overall-rating' : True})['data-bv-average-overall-rating']
#processor family
processor=product.find('li',{'class':'processorfamily'})
if processor is not None:
processor=processor.text.strip()
else:
processor=None
# processor name, generation, type
if processor is not None:
try:
if processor.find('Intel')!=-1:
proc_company='Intel'
if processor.find('Core')!=-1: # processor type and generation for Intel core processors
generation=re.findall(r'\d',processor)[0]
proc_type=re.findall(r'i\d',processor)[0]
else:
temp=processor.split()
proc_type=' '.join(temp[1:-1]) # processor type for Intel pentium and other series processors
elif processor.find('AMD')!=-1: # processor type for AMD processors
proc_company='AMD'
temp=processor.split()
proc_type=temp[1]+' '+temp[2]
generation=None
else: # processor type for other processors
proc_company=None
proc_type=None
generation=None
except:
print('something went wrong')
else:
proc_company=None
proc_type=None
generation=None
# Os installed
os_installed=product.find('li',{'class':'osinstalled'})
if os_installed is not None:
os_installed=os_installed.text
else:
os_installed=None
# RAM
ram=product.find('li',{'class':'memstdes_01'})
if ram is not None:
ram=ram.text
else:
ram=None
#hard disk
hd=product.find('li',{'class':'hd_01des'})
if hd is not None:
hd=hd.text
else:
hd=None
# graphic card information is stored in <li> element with two different classes so multiple classes are used below
if product.find('li',{'class':['graphicseg_01card_01','graphicseg_02card_01']}) is not None:
graphic_card=product.find('li',{'class':['graphicseg_01card_01','graphicseg_02card_01']}).text
else:
graphic_card=None
#display-type
display_type=product.find('li',{'class':['display-displaydes']})
if display_type is None:
display_type=None
else:
display_type=display_type.text
#price
# actual price, discount and final price all stored in span tag with class price hence we get a list
prices=product.find_all('span',{'class':'price'})
if len(prices)==3:
aprice=prices[0].text[1:]
fprice=prices[1].text[1:]
dprice=prices[2].text[1:]
else:
aprice=fprice=prices[0].text[1:]
dprice='0'
#included items
inc_items=product.find_all('ul',{'class':'included'})
items_list=[]
if inc_items is not None:
for i in inc_items:
items_list=[item.text for item in i.find_all('li')]
inc_items=','.join(items_list) # converting the list of included items into string seperated by commas
print('Name:',name)
print('rating:',rating)
print("processor:",processor)
print("processor_company:",proc_company)
print("processor_type:",proc_type)
print("generation:",generation)
print("os_installed:",os_installed)
print("ram:",ram)
print("hard_disk:",hd)
print("graphic_card:",graphic_card)
print("display:",display_type)
print("Actual_price:",aprice)
print("final_price:",fprice)
print("Discount:",dprice)
print("included_items:",inc_items)
# creating a dictionary with keys as specification names and values as their respective information
prod_data={
'Name':name,
'rating':rating,
'processor':processor,
'processor_company':proc_company,
'processor_type':proc_type,
'generation':generation,
'os_installed':os_installed,
'ram':ram,
'hard_disk':hd,
'graphic_card':graphic_card,
'display':display_type,
'actual_price':aprice,
'final_price':fprice,
'discout':dprice,
'included_items':inc_items
}
#Appending the each product to list
prod_list.append(prod_data)
# end of the product
print('-'*50)
# end of all products
print('*'*50)
All the products in website is rendered across 6 pages so we set a loop and get the all products of each page at time and call the getData function defined above
for page_count in range(1,7):
url='https://store.hp.com/in-en/default/laptops-tablets.html?p='+str(page_count)+'&product_list_limit=30'
data=requests.get(url)
data_soup=BeautifulSoup(data.text,'lxml')
products=data_soup.find_all("div", {"class": "product-item-details"})
getData(products)
Name: HP ENVY x360 - 13-ag0035au
rating: 3.6667
processor: AMD Ryzen™ 5 processor
processor_company: AMD
processor_type: Ryzen™ 5
generation: None
os_installed: Windows 10 Home Single Language 64
ram: 8 GB DDR4-2400 SDRAM (onboard)
hard_disk: 256 GB SSD
graphic_card: AMD Radeon™ Vega 8 Graphics
display: 13.3" FHD multitouch-enabled edge-to-edge glass (1920 x 1080)
Actual_price: 83,496
final_price: 72,990
Discount: 10,506
included_items:
--------------------------------------------------
Name: HP Gaming Pavilion - 15-cx0140tx
rating: 5.0000
processor: 8th Generation Intel® Core™ i5 processor
processor_company: Intel
processor_type: i5
generation: 8
os_installed: Windows 10 Home Single Language 64
ram: 8 GB DDR4-2666 SDRAM (1 x 8 GB)
hard_disk: 1 TB 7200 rpm SATA
graphic_card: NVIDIA® GeForce® GTX 1050 (4 GB GDDR5 dedicated)
display: None
Actual_price: 86,476
final_price: 72,990
Discount: 13,486
included_items: HP Odyssey backpack (Worth ₹3,499),Microsoft Office Home and Student
--------------------------------------------------
Name: HP Notebook - 15-da0435tx
rating: 4.0000
processor: 7th Generation Intel® Core™ i3 processor
processor_company: Intel
processor_type: i3
generation: 7
os_installed: Windows 10 Home Single Language 64
ram: 8 GB DDR4-2133 SDRAM (1 x 8 GB)
hard_disk: 1 TB 5400 rpm SATA
graphic_card: NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated)
display: None
Actual_price: 50,292
final_price: 44,580
Discount: 5,712
included_items:
--------------------------------------------------
|
|
|
--------------------------------------------------
Name: HP ProBook 445 G2 Notebook PC (ENERGY STAR)
rating: 0.0000
processor: AMD A10 APU
processor_company: AMD
processor_type: A10 APU
generation: None
os_installed: Windows 10 Pro 64
ram: 8 GB DDR3L-1600 SDRAM (1 x 8 GB)
hard_disk: 500 GB 5400 rpm SATA
graphic_card: AMD Radeon™ R6
display: 14" diagonal HD anti-glare LED-backlit (1366 x 768)
Actual_price: 79,000
final_price: 78,000
Discount: 1,000
included_items: HP Overnighter Backpack (Worth ₹2,499)
--------------------------------------------------
**************************************************
creating a dataframe with the List of products
df=pd.DataFrame(prod_list)
df.head()
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
Name | actual_price | discout | display | final_price | generation | graphic_card | hard_disk | included_items | os_installed | processor | processor_company | processor_type | ram | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | HP ENVY x360 - 13-ag0035au | 83,496 | 10,506 | 13.3" FHD multitouch-enabled edge-to-edge glas... | 72,990 | None | AMD Radeon™ Vega 8 Graphics | 256 GB SSD | Windows 10 Home Single Language 64 | AMD Ryzen™ 5 processor | AMD | Ryzen™ 5 | 8 GB DDR4-2400 SDRAM (onboard) | 3.6667 | |
1 | HP Gaming Pavilion - 15-cx0140tx | 86,476 | 13,486 | None | 72,990 | 8 | NVIDIA® GeForce® GTX 1050 (4 GB GDDR5 dedicated) | 1 TB 7200 rpm SATA | HP Odyssey backpack (Worth ₹3,499),Microsoft O... | Windows 10 Home Single Language 64 | 8th Generation Intel® Core™ i5 processor | Intel | i5 | 8 GB DDR4-2666 SDRAM (1 x 8 GB) | 5.0000 |
2 | HP Notebook - 15-da0435tx | 50,292 | 5,712 | None | 44,580 | 7 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | Windows 10 Home Single Language 64 | 7th Generation Intel® Core™ i3 processor | Intel | i3 | 8 GB DDR4-2133 SDRAM (1 x 8 GB) | 4.0000 | |
3 | HP Notebook - 15g-dr0006tx | 66,137 | 7,146 | None | 58,991 | 8 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ₹1,123),1 Year O... | Windows 10 Home Single Language 64 | 8th Generation Intel® Core™ i5 processor | Intel | i5 | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 4.0625 |
4 | HP Notebook 15-da1030tu | 50,720 | 3,730 | None | 46,990 | 8 | None | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ₹1,123),Microsof... | Windows 10 Home 64 | 8th Generation Intel® Core™ i5 processor | Intel | i5 | 4 GB DDR4-2400 SDRAM | 2.0000 |
saving the dataframe into Hp_laptops.csv
df.to_csv("Hp_laptops.csv")
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # visualization
import re
%matplotlib inline
To get a brief overview, this notebook is dedicated to 5 sections as follows
- Context of dataset
- Data cleaning
- EDA
- Results
- Conclusion
Loading file Hp_laptos.csv into dataframe
df=pd.read_csv('Hp_laptops.csv',encoding='unicode-escape')
Let's have glance at a bit of data
df.head()
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
Unnamed: 0 | Name | actual_price | discout | display | final_price | generation | graphic_card | hard_disk | included_items | os_installed | processor | processor_company | processor_type | ram | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | HP ENVY x360 - 13-ag0035au | 83,496 | 10,506 | 13.3" FHD multitouch-enabled edge-to-edge glas... | 72,990 | NaN | AMD Radeon� Vega 8 Graphics | 256 GB SSD | NaN | Windows 10 Home Single Language 64 | AMD Ryzen� 5 processor | AMD | Ryzen� 5 | 8 GB DDR4-2400 SDRAM (onboard) | 3.6667 |
1 | 1 | HP Gaming Pavilion - 15-cx0140tx | 86,476 | 13,486 | NaN | 72,990 | 8.0 | NVIDIA® GeForce® GTX 1050 (4 GB GDDR5 dedicated) | 1 TB 7200 rpm SATA | HP Odyssey backpack (Worth ?3,499),Microsoft O... | Windows 10 Home Single Language 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 8 GB DDR4-2666 SDRAM (1 x 8 GB) | 5.0000 |
2 | 2 | HP Notebook - 15-da0435tx | 50,292 | 5,712 | NaN | 44,580 | 7.0 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | NaN | Windows 10 Home Single Language 64 | 7th Generation Intel® Core� i3 processor | Intel | i3 | 8 GB DDR4-2133 SDRAM (1 x 8 GB) | 4.0000 |
3 | 3 | HP Notebook - 15g-dr0006tx | 66,137 | 7,146 | NaN | 58,991 | 8.0 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ?1,123),1 Year O... | Windows 10 Home Single Language 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 4.0625 |
4 | 4 | HP Notebook 15-da1030tu | 50,720 | 3,730 | NaN | 46,990 | 8.0 | NaN | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ?1,123),Microsof... | Windows 10 Home 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 4 GB DDR4-2400 SDRAM | 2.0000 |
Information of each column in the dataframe
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 16 columns):
Unnamed: 0 163 non-null int64
Name 163 non-null object
actual_price 163 non-null object
discout 163 non-null object
display 29 non-null object
final_price 163 non-null object
generation 131 non-null float64
graphic_card 161 non-null object
hard_disk 161 non-null object
included_items 131 non-null object
os_installed 163 non-null object
processor 149 non-null object
processor_company 146 non-null object
processor_type 146 non-null object
ram 161 non-null object
rating 158 non-null float64
dtypes: float64(2), int64(1), object(13)
memory usage: 20.5+ KB
Shape of the dataframe
df.shape
(163, 16)
In the dataframe first column is no use at all
Dropping the first column
df.drop(['Unnamed: 0'],axis=1,inplace=True)
Data cleaning
- check if any NA's are present
df.isnull().sum()
Name 0
actual_price 0
discout 0
display 134
final_price 0
generation 32
graphic_card 2
hard_disk 2
included_items 32
os_installed 0
processor 14
processor_company 17
processor_type 17
ram 2
rating 5
dtype: int64
The column display has 134 NaN values which is more than 40% of total data. So we can drop it
df.drop(['display'],axis=1,inplace=True)
df.head()
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
Name | actual_price | discout | final_price | generation | graphic_card | hard_disk | included_items | os_installed | processor | processor_company | processor_type | ram | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | HP ENVY x360 - 13-ag0035au | 83,496 | 10,506 | 72,990 | NaN | AMD Radeon� Vega 8 Graphics | 256 GB SSD | NaN | Windows 10 Home Single Language 64 | AMD Ryzen� 5 processor | AMD | Ryzen� 5 | 8 GB DDR4-2400 SDRAM (onboard) | 3.6667 |
1 | HP Gaming Pavilion - 15-cx0140tx | 86,476 | 13,486 | 72,990 | 8.0 | NVIDIA® GeForce® GTX 1050 (4 GB GDDR5 dedicated) | 1 TB 7200 rpm SATA | HP Odyssey backpack (Worth ?3,499),Microsoft O... | Windows 10 Home Single Language 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 8 GB DDR4-2666 SDRAM (1 x 8 GB) | 5.0000 |
2 | HP Notebook - 15-da0435tx | 50,292 | 5,712 | 44,580 | 7.0 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | NaN | Windows 10 Home Single Language 64 | 7th Generation Intel® Core� i3 processor | Intel | i3 | 8 GB DDR4-2133 SDRAM (1 x 8 GB) | 4.0000 |
3 | HP Notebook - 15g-dr0006tx | 66,137 | 7,146 | 58,991 | 8.0 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ?1,123),1 Year O... | Windows 10 Home Single Language 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 4.0625 |
4 | HP Notebook 15-da1030tu | 50,720 | 3,730 | 46,990 | 8.0 | NaN | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ?1,123),Microsof... | Windows 10 Home 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 4 GB DDR4-2400 SDRAM | 2.0000 |
df.actual_price.dtype
dtype('O')
In the dataframe actual_price,final_price and discount columns should be of numerical datatype but they are of object datatype in the dataframe so function strtoint is definded below which removes the comma in the values and convert them into int datatype
def strtoint(column):
return column.apply(lambda x:int(x.replace(',','')))
df['actual_price']=strtoint(df['actual_price'])
df['discout']=strtoint(df['discout'])
df['final_price']=strtoint(df['final_price'])
df.head()
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
Name | actual_price | discout | final_price | generation | graphic_card | hard_disk | included_items | os_installed | processor | processor_company | processor_type | ram | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | HP ENVY x360 - 13-ag0035au | 83496 | 10506 | 72990 | NaN | AMD Radeon� Vega 8 Graphics | 256 GB SSD | NaN | Windows 10 Home Single Language 64 | AMD Ryzen� 5 processor | AMD | Ryzen� 5 | 8 GB DDR4-2400 SDRAM (onboard) | 3.6667 |
1 | HP Gaming Pavilion - 15-cx0140tx | 86476 | 13486 | 72990 | 8.0 | NVIDIA® GeForce® GTX 1050 (4 GB GDDR5 dedicated) | 1 TB 7200 rpm SATA | HP Odyssey backpack (Worth ?3,499),Microsoft O... | Windows 10 Home Single Language 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 8 GB DDR4-2666 SDRAM (1 x 8 GB) | 5.0000 |
2 | HP Notebook - 15-da0435tx | 50292 | 5712 | 44580 | 7.0 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | NaN | Windows 10 Home Single Language 64 | 7th Generation Intel® Core� i3 processor | Intel | i3 | 8 GB DDR4-2133 SDRAM (1 x 8 GB) | 4.0000 |
3 | HP Notebook - 15g-dr0006tx | 66137 | 7146 | 58991 | 8.0 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ?1,123),1 Year O... | Windows 10 Home Single Language 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 4.0625 |
4 | HP Notebook 15-da1030tu | 50720 | 3730 | 46990 | 8.0 | NaN | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ?1,123),Microsof... | Windows 10 Home 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 4 GB DDR4-2400 SDRAM | 2.0000 |
The generation column has 32 missing values Let's see the unique values and their frequency details
df.generation.value_counts()
8.0 99
7.0 29
6.0 3
Name: generation, dtype: int64
generations are for the Intel core processor series. AMD, Intel pentium series doesn't have generation so replace those with version 1
df.generation.fillna('1.0',inplace=True)
df.generation.isnull().sum()
0
The column graphic_card has two null values.Let's fill them with mode of that column
df.graphic_card.value_counts()[:5]
Intel® UHD Graphics 620 54
Intel® HD Graphics 620 19
AMD Radeon� Pro WX 3100 Graphics (2 GB GDDR5 dedicated) 12
NVIDIA® Quadro® P1000 (4 GB GDDR5 dedicated) 8
NVIDIA® Quadro® P600 (4 GB GDDR5 dedicated) 7
Name: graphic_card, dtype: int64
df.graphic_card.fillna(df.graphic_card.mode()[0],inplace=True)
df.graphic_card.isnull().sum()
0
df.hard_disk.value_counts()[:5]
1 TB 5400 rpm SATA 51
1 TB 7200 rpm SATA 16
512 GB PCIe® NVMe� SSD 13
512 GB SSD 13
256 GB SSD 11
Name: hard_disk, dtype: int64
df.hard_disk.fillna(df.hard_disk.mode()[0],inplace=True)
df.hard_disk.isnull().sum()
0
df.included_items.fillna('Not provided',inplace=True)
df.included_items.isnull().sum()
0
df.ram.fillna(df.ram.mode()[0],inplace=True)
df.included_items.isnull().sum()
0
df.rating.fillna(df.rating.mean(),inplace=True)
df.rating.isnull().sum()
0
df[df.processor_company.isnull()]
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
Name | actual_price | discout | final_price | generation | graphic_card | hard_disk | included_items | os_installed | processor | processor_company | processor_type | ram | rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
40 | HP ZBook 15v G5 Mobile Workstation | 225937 | 22095 | 203842 | 1.0 | NVIDIA® Quadro® P600 (4 GB GDDR5 dedicated) | 1 TB 7200 rpm SATA | HP Original Bag (Worth ?1,499) (#5DD44PA),3 Y... | Windows 10 Pro 64 | 16 GB DDR4-2666 SDRAM (1 x 16 GB) | NaN | NaN | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 1.194378 |
41 | HP ZBook 15v G5 Mobile Workstation | 233611 | 22861 | 210750 | 1.0 | NVIDIA® Quadro® P600 (4 GB GDDR5 dedicated) | 256 GB SSD | HP Original Bag (Worth ?1,499) (#5DD44PA),3 Y... | Windows 10 Pro 64 | NaN | NaN | NaN | 16 GB DDR4-2666 SDRAM (1 x 16 GB) | 1.194378 |
42 | HP ZBook 15v G5 Mobile Workstation | 216245 | 21125 | 195120 | 1.0 | NVIDIA® Quadro® P600 (4 GB GDDR5 dedicated) | 1 TB 7200 rpm SATA | HP Original Bag (Worth ?1,499) (#5DD44PA),3 Y... | Windows 10 Pro 64 | NaN | NaN | NaN | 16 GB DDR4-2666 SDRAM (1 x 16 GB) | 1.194378 |
43 | HP ZBook 15v G5 Mobile Workstation | 203281 | 19828 | 183453 | 1.0 | NVIDIA® Quadro® P600 (4 GB GDDR5 dedicated) | 2 TB 5400 rpm SATA | HP Original Bag (Worth ?1,499) (#5DD44PA),3 Y... | Windows 10 Pro 64 | NaN | NaN | NaN | 16 GB DDR4-2666 SDRAM (1 x 16 GB) | 1.194378 |
44 | HP ZBook 15v G5 Mobile Workstation | 181301 | 17100 | 164201 | 1.0 | NVIDIA® Quadro® P600 (4 GB GDDR5 dedicated) | 1 TB 7200 rpm SATA | HP Original Bag (Worth ?1,499) (#5DD44PA),3 Y... | Windows 10 Pro 64 | NaN | NaN | NaN | 16 GB DDR4-2666 SDRAM (1 x 16 GB) | 1.194378 |
106 | HP ZBook 15 G5 Mobile Workstation | 350406 | 124853 | 225553 | 1.0 | NVIDIA® Quadro® P2000 (4 GB GDDR5 dedicated) | 1 TB 7200 rpm SATA + 512 GB SSD | HP Business Backpack (Worth ?4,200) | Windows 10 Pro 64 | NaN | NaN | NaN | 16 GB (1x16 GB) DDR4 2666 | 0.000000 |
107 | HP ZBook 15 G5 Mobile Workstation | 316190 | 110771 | 205419 | 1.0 | NVIDIA® Quadro® P1000 (4 GB GDDR5 dedicated) | 1 TB 7200 rpm SATA + 128 GB SSD | HP Business Backpack (Worth ?4,200) | Windows 10 Pro 64 | NaN | NaN | NaN | 8 GB DDR4-2666 SDRAM (1 x 8 GB) | 0.000000 |
108 | HP ZBook 15 G5 Mobile Workstation | 240765 | 83941 | 156824 | 1.0 | NVIDIA® Quadro® P1000 (4 GB GDDR5 dedicated) | 1 TB 7200 rpm SATA | HP Business Backpack (Worth ?4,200) | Windows 10 Pro 64 | NaN | NaN | NaN | 8 GB DDR4-2666 SDRAM (1 x 8 GB) | 0.000000 |
109 | HP ZBook x2 G4 Detachable Workstation | 408283 | 145897 | 262386 | 1.0 | NVIDIA® Quadro® M620 (2 GB GDDR5 dedicated) | 512 GB SSD | HP Business Backpack (Worth ?4,200) | Windows 10 Pro 64 | NaN | NaN | NaN | 16 GB DDR4-2133 SDRAM (2 x 8 GB) | 0.000000 |
110 | HP ZBook x2 G4 Detachable Workstation | 369052 | 128360 | 240692 | 1.0 | NVIDIA® Quadro® M620 (2 GB GDDR5 dedicated) | 512 GB SSD | HP Business Backpack (Worth ?4,200) | Windows 10 Pro 64 | NaN | NaN | NaN | 8 GB DDR4-2133 SDRAM (2 x 4 GB) | 0.000000 |
111 | HP ZBook Studio x360 G5 Convertible Workstation | 400665 | 144380 | 256285 | 1.0 | NVIDIA® Quadro® P1000 (4 GB GDDR5 dedicated) | 1 TB SSD | HP Business Backpack (Worth ?4,200) | Windows 10 Pro 64 | NaN | NaN | NaN | 16 GB DDR4-2666 SDRAM (1 x 16 GB) | 0.000000 |
113 | HP ZBook Studio G5 Mobile Workstation | 254780 | 89781 | 164999 | 1.0 | NVIDIA® Quadro® P1000 (4 GB GDDR5 dedicated) | 512 GB SSD | HP Business Backpack (Worth ?4,200) | Windows 10 Pro 64 | NaN | NaN | NaN | 8 GB DDR4-2666 SDRAM (1 x 8 GB) | 0.000000 |
122 | HP ProBook 640 G4 Notebook PC | 95603 | 2830 | 92773 | 1.0 | Intel® UHD Graphics 620 | 1 TB 7200 rpm SATA | HP Overnighter Backpack (Worth ?2,499) | 8th Generation Intel® Core� i5 processor | NaN | NaN | NaN | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 0.000000 |
123 | HP ProBook 640 G4 Notebook PC | 106867 | 3164 | 103703 | 1.0 | Intel® UHD Graphics 620 | 1 TB 7200 rpm SATA | HP Overnighter Backpack (Worth ?2,499) | Intel Core i7-8550U Processor | NaN | NaN | NaN | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 0.000000 |
126 | HP ZBook 15u G4 Mobile Workstation | 176453 | 5453 | 171000 | 1.0 | AMD FirePro� W4190M Graphics (2 GB GDDR5 dedic... | 1 TB 5400 rpm SATA | Not provided | Windows 10 Pro 64 | 16 GB DDR4-2133 SDRAM (1 x 16 GB) | NaN | NaN | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 0.000000 |
146 | HP 245 G6 | 24200 | 0 | 24200 | 1.0 | Intel® UHD Graphics 620 | 1 TB 5400 rpm SATA | Not provided | FreeDOS | BU IDS UMA A9-9425 | NaN | NaN | 4GB (1x4GB) 1866 DDR4 | 0.000000 |
150 | HP ZBook 15u G5 Mobile Workstation | 264534 | 103017 | 161517 | 1.0 | AMD Radeon� Pro WX 3100 Graphics (2 GB GDDR5 d... | 512 GB SSD | HP Business Backpack (Worth ?4,200) | Windows 10 Pro 64 | NaN | NaN | NaN | 16 GB DDR4-2400 SDRAM (1 x 16 GB) | 0.000000 |
print(df.processor_type.mode()[0])
print(df.processor.mode()[0])
print(df.processor_company.mode()[0])
i5
8th Generation Intel® Core� i5 processor
Intel
df.at[38, 'processor']=df.processor.mode()[0]
df.at[120,'processor']=df.processor.mode()[0]
df.at[147,'processor']=df.processor.mode()[0]
df.processor.fillna(df.processor.mode()[0],inplace=True)
df.processor_company.fillna(df.processor_company.mode()[0],inplace=True)
df.processor_type.fillna(df.processor_type.mode()[0],inplace=True)
df.isnull().sum()
Name 0
actual_price 0
discout 0
final_price 0
generation 0
graphic_card 0
hard_disk 0
included_items 0
os_installed 0
processor 0
processor_company 0
processor_type 0
ram 0
rating 0
dtype: int64
import re
def getSize(value):
b=re.findall(r'\b\d+',str(value))
if len(b)>0:
return b[0]
return None
df['ram_size']=df['ram'].apply(getSize)
df['ram_size'].value_counts()
8 86
4 44
16 32
32 1
Name: ram_size, dtype: int64
df.ram_size.isnull().sum()
0
df.ram_size=df.ram_size.astype('int')
def getSize(value):
p=re.compile(r'\b\d+\s\w+')
val=p.findall(value)
size=None
if len(val)>0:
size=val[0]
size_val=re.findall(r'\d+',size)[0]
if 'TB' in size:
size_val=int(size_val)*1024
return size_val
return size
df['hd_size(GB)']=df.hard_disk.apply(getSize)
df['hd_size(GB)'].isnull().sum()
1
df['hd_size(GB)'].fillna(df['hd_size(GB)'].mode()[0],inplace=True)
df['hd_size(GB)']=df['hd_size(GB)'].astype('int')
df.head()
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
Name | actual_price | discout | final_price | generation | graphic_card | hard_disk | included_items | os_installed | processor | processor_company | processor_type | ram | rating | ram_size | hd_size(GB) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | HP ENVY x360 - 13-ag0035au | 83496 | 10506 | 72990 | 1.0 | AMD Radeon� Vega 8 Graphics | 256 GB SSD | Not provided | Windows 10 Home Single Language 64 | AMD Ryzen� 5 processor | AMD | Ryzen� 5 | 8 GB DDR4-2400 SDRAM (onboard) | 3.6667 | 8 | 256 |
1 | HP Gaming Pavilion - 15-cx0140tx | 86476 | 13486 | 72990 | 8 | NVIDIA® GeForce® GTX 1050 (4 GB GDDR5 dedicated) | 1 TB 7200 rpm SATA | HP Odyssey backpack (Worth ?3,499),Microsoft O... | Windows 10 Home Single Language 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 8 GB DDR4-2666 SDRAM (1 x 8 GB) | 5.0000 | 8 | 1024 |
2 | HP Notebook - 15-da0435tx | 50292 | 5712 | 44580 | 7 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | Not provided | Windows 10 Home Single Language 64 | 7th Generation Intel® Core� i3 processor | Intel | i3 | 8 GB DDR4-2133 SDRAM (1 x 8 GB) | 4.0000 | 8 | 1024 |
3 | HP Notebook - 15g-dr0006tx | 66137 | 7146 | 58991 | 8 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ?1,123),1 Year O... | Windows 10 Home Single Language 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 4.0625 | 8 | 1024 |
4 | HP Notebook 15-da1030tu | 50720 | 3730 | 46990 | 8 | Intel® UHD Graphics 620 | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ?1,123),Microsof... | Windows 10 Home 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 4 GB DDR4-2400 SDRAM | 2.0000 | 4 | 1024 |
df.isnull().sum()
Name 0
actual_price 0
discout 0
final_price 0
generation 0
graphic_card 0
hard_disk 0
included_items 0
os_installed 0
processor 0
processor_company 0
processor_type 0
ram 0
rating 0
ram_size 0
hd_size(GB) 0
dtype: int64
df.to_csv('Hp_laptops_new.csv')
df.actual_price.plot(kind='hist')
<matplotlib.axes._subplots.AxesSubplot at 0x1476f947cf8>
df[(df.final_price>30000)&(df.final_price<60000) & (df['hd_size(GB)']==1024) & (df['ram_size']==8)]
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
Name | actual_price | discout | final_price | generation | graphic_card | hard_disk | included_items | os_installed | processor | processor_company | processor_type | ram | rating | ram_size | hd_size(GB) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | HP Notebook - 15-da0435tx | 50292 | 5712 | 44580 | 7 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | Not provided | Windows 10 Home Single Language 64 | 7th Generation Intel® Core� i3 processor | Intel | i3 | 8 GB DDR4-2133 SDRAM (1 x 8 GB) | 4.0000 | 8 | 1024 |
3 | HP Notebook - 15g-dr0006tx | 66137 | 7146 | 58991 | 8 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ?1,123),1 Year O... | Windows 10 Home Single Language 64 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 4.0625 | 8 | 1024 |
5 | HP Pavilion - 15-cw0027au | 70025 | 10035 | 59990 | 1.0 | AMD Radeon� Vega 8 Graphics | 1 TB 5400 rpm SATA + 128 GB SSD | HP Trendsetter Bag (Worth ?2,356) | Windows 10 Home Single Language 64 | AMD Ryzen� 5 processor | AMD | Ryzen� 5 | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 3.0000 | 8 | 1024 |
48 | HP 348 G4 Notebook PC | 55539 | 2756 | 52783 | 7 | Intel® HD Graphics 620 | 1 TB 7200 rpm SATA | HP Original Bag (Worth ?1,499) (#5DD44PA) | FreeDOS | 7th Generation Intel® Core� i5 processor | Intel | i5 | 8 GB DDR4-2133 SDRAM (1 x 8 GB) | 5.0000 | 8 | 1024 |
115 | HP Notebook - 15-da0074tx | 46010 | 6516 | 39494 | 7 | NVIDIA® GeForce® MX110 (2 GB DDR3 dedicated) | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ?1,123) | FreeDOS 2.0 | 7th Generation Intel® Core� i3 processor | Intel | i3 | 8 GB DDR4-2133 SDRAM (1 x 8 GB) | 5.0000 | 8 | 1024 |
117 | HP Notebook - 15-da0077tx | 57786 | 7065 | 50721 | 8 | NVIDIA® GeForce® MX110 (2 GB GDDR5 dedicated) | 1 TB 5400 rpm SATA | HP Original Laptop Bag (Worth ?1,123),1 Year O... | FreeDOS 2.0 | 8th Generation Intel® Core� i5 processor | Intel | i5 | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 5.0000 | 8 | 1024 |
df.processor_company.value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x1476fe8ae80>
Intel company processor is used more
df.processor.value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x1476ffea390>
df.groupby(['processor_company']).mean()
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
actual_price | discout | final_price | rating | ram_size | hd_size(GB) | |
---|---|---|---|---|---|---|
processor_company | ||||||
AMD | 57390.000000 | 2974.181818 | 54415.818182 | 1.712127 | 5.818182 | 625.454545 |
Intel | 134072.809211 | 22852.592105 | 111220.217105 | 1.156910 | 8.842105 | 796.894737 |
df.os_installed.value_counts()
Windows 10 Pro 64 85
Windows 10 Home Single Language 64 43
FreeDOS 2.0 13
Chrome OS� 64 5
Windows 10 Home 64 3
Windows 10 Home Single Language 64 � HP recommends Windows 10 Pro. 3
FreeDOS 2
Windows 10 Pro 64 � HP recommends Windows 10 Pro. 2
Free DOS 1
FreeDos 2.0 1
Windows 7 Professional 64 1
FreeDOS 3.0 1
8th Generation Intel® Core� i5 processor 1
Windows 10 Pro 1
Intel Core i7-8550U Processor 1
Name: os_installed, dtype: int64
df.os_installed.value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x147700539b0>
df.ram_size.corr(df.actual_price)
0.76195099638699681
df.ram_size.value_counts().plot(kind='pie')
<matplotlib.axes._subplots.AxesSubplot at 0x14771156f98>
df.hard_disk.value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x14771176668>
df.corr()
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
actual_price | discout | final_price | rating | ram_size | hd_size(GB) | |
---|---|---|---|---|---|---|
actual_price | 1.000000 | 0.856027 | 0.956428 | -0.237846 | 0.761951 | -0.124959 |
discout | 0.856027 | 1.000000 | 0.667800 | -0.233100 | 0.498787 | -0.133535 |
final_price | 0.956428 | 0.667800 | 1.000000 | -0.210822 | 0.815429 | -0.104509 |
rating | -0.237846 | -0.233100 | -0.210822 | 1.000000 | -0.033313 | 0.152389 |
ram_size | 0.761951 | 0.498787 | 0.815429 | -0.033313 | 1.000000 | -0.032704 |
hd_size(GB) | -0.124959 | -0.133535 | -0.104509 | 0.152389 | -0.032704 | 1.000000 |
df.describe()
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
actual_price | discout | final_price | rating | ram_size | hd_size(GB) | |
---|---|---|---|---|---|---|
count | 163.000000 | 163.000000 | 163.000000 | 163.000000 | 163.000000 | 163.000000 |
mean | 128897.895706 | 21511.104294 | 107386.791411 | 1.194378 | 8.638037 | 785.325153 |
std | 100316.252958 | 39349.265400 | 69667.796932 | 1.831696 | 4.487049 | 511.257767 |
min | 20999.000000 | 0.000000 | 20999.000000 | 0.000000 | 4.000000 | 16.000000 |
25% | 61889.500000 | 1886.000000 | 58016.000000 | 0.000000 | 4.000000 | 512.000000 |
50% | 88742.000000 | 3286.000000 | 84390.000000 | 0.000000 | 8.000000 | 1024.000000 |
75% | 173869.500000 | 11370.000000 | 146606.000000 | 2.235350 | 8.000000 | 1024.000000 |
max | 667152.000000 | 224481.000000 | 442671.000000 | 5.000000 | 32.000000 | 5400.000000 |
df.describe(include=[np.object]).transpose()
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
count | unique | top | freq | |
---|---|---|---|---|
Name | 163 | 93 | HP ZBook 15u G5 Mobile Workstation | 9 |
generation | 163 | 4 | 8 | 99 |
graphic_card | 163 | 38 | Intel® UHD Graphics 620 | 56 |
hard_disk | 163 | 31 | 1 TB 5400 rpm SATA | 53 |
included_items | 163 | 23 | Not provided | 32 |
os_installed | 163 | 15 | Windows 10 Pro 64 | 85 |
processor | 163 | 30 | 8th Generation Intel® Core� i5 processor | 58 |
processor_company | 163 | 2 | Intel | 152 |
processor_type | 163 | 16 | i5 | 73 |
ram | 163 | 33 | 8 GB DDR4-2400 SDRAM (1 x 8 GB) | 56 |
print(df[df.actual_price>100000]['Name'].unique())
['HP Pavilion 14-ce1003tx' 'HP EliteBook x360 1030 G3 Notebook PC'
'HP EliteBook 830 G5 Notebook PC' 'HP EliteBook 840r G4 Notebook PC'
'HP ProBook x360 440 G1 Notebook PC' 'HP ZBook 15v G5 Mobile Workstation'
'OMEN by HP - 15-dc0084tx' 'HP EliteBook x360 1030 G2'
'HP EliteBook 1040 G4 Notebook PC'
'HP EliteBook x360 1030 G2 (ENERGY STAR)' 'HP ProBook 430 G6 Notebook PC'
'HP ZBook 15u G5 Mobile Workstation' 'HP Pavilion x360 14-dh0045tx'
'HP ZBook 14u G5 Mobile Workstation' 'HP Spectre x360 - 13-ap0122tu'
'HP Spectre x360 - 13-ap0121tu' 'HP Spectre x360 - 13-ap0102tu'
'HP Spectre x360 - 13-ap0101tu' 'HP Spectre x360 - 13-ap0100tu'
'HP ZBook 17 G5 Mobile Workstation' 'HP EliteBook 735 G5 Notebook PC'
'HP EliteBook 1050 G1 Notebook PC' 'HP ZBook Studio G5 Mobile Workstation'
'HP ZBook 15 G5 Mobile Workstation'
'HP ZBook x2 G4 Detachable Workstation'
'HP ZBook Studio x360 G5 Convertible Workstation' 'HP ENVY - 13-ah0044tx'
'HP ProBook 640 G4 Notebook PC' 'HP ZBook 15u G4 Mobile Workstation'
'HP Elite x2 1012 G1 Tablet (ENERGY STAR)' 'OMEN by HP 15-dc1006tx'
'HP Spectre Folio - 13-ak0040tu' 'HP Elite x2 1013 G3 Tablet'
'HP EliteBook x360 1020 G2']
Laptop with highest price
df[df.actual_price==df.actual_price.max()]
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
Name | actual_price | discout | final_price | generation | graphic_card | hard_disk | included_items | os_installed | processor | processor_company | processor_type | ram | rating | ram_size | hd_size(GB) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
85 | HP ZBook 17 G5 Mobile Workstation | 667152 | 224481 | 442671 | 7 | NVIDIA® Quadro® P4200 (8 GB GDDR5 dedicated) | 512 GB PCIe® NVMe� SSD | Not provided | Windows 10 Pro 64 | Intel® Xeon® processor | Intel | Xeon® | 32 GB DDR4-2666 ECC SDRAM (2 X 16 GB) | 0.0 | 32 | 512 |
plt.title("ram size vs actual price")
plt.xlabel("ram size")
plt.ylabel("price")
plt.scatter(df['ram_size'],df['actual_price'])
<matplotlib.collections.PathCollection at 0x14771fb99b0>
- Intel company processors are mostly preferred
- Windows 10 Pro 64 and 8th Generation Intel i5 processor is used in good number
- ram size effect the price of laptops.They have good correlation 0.7
- 9 laptops models are there in economical range
- 34 laptop models are there with price greater than one lakh
- 1 TB 5400 RPM SATA is used for many models