diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..5172429
Binary files /dev/null and b/.DS_Store differ
diff --git a/README.md b/README.md
index 292be4e..6179bd1 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,10 @@
Product Matching Using Machine Learning
- The topic is about product matching via Machine Learning. This involves using various machine learning techniques such as natural language processing image recognition and collaborative filtering algorithms to match similar products together. To implement this project a low-level project structure is suggested with different folders for data notebooks source code and testing. The sequence of model implementation and useful Python libraries for product matching via Machine Learning are also described. Finally a 3-month timeline is presented for the development to deployment of the product matching project.
+
+ <<< *REVAMP IN PROGRESS* >>>
+
+ The topic is product matching via Machine Learning: using techniques such as natural language processing, image recognition, and collaborative filtering to match similar products together. To implement this project, a low-level project structure is suggested, with separate folders for data, notebooks, source code, and tests. The sequence of model implementation and useful Python libraries for product matching are also described. Finally, a 3-month timeline from development to deployment of the product matching project is presented.
diff --git a/StuctureMaker.py b/StuctureMaker.py
deleted file mode 100644
index 68f2edc..0000000
--- a/StuctureMaker.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import os
-
-# Create main folders
-folders = ['data', 'notebooks', 'src', 'tests']
-for folder in folders:
- os.makedirs(folder)
-
-# Create subfolders inside data folder
-data_folders = ['raw', 'processed', 'models']
-for folder in data_folders:
- os.makedirs(os.path.join('data', folder))
-
-# Create subfolders inside src folder
-src_folders = ['models', 'preprocessing', 'utils']
-for folder in src_folders:
- os.makedirs(os.path.join('src', folder))
-
-# Create sample files inside the folders
-with open(os.path.join('data', 'raw', 'data.csv'), 'w') as f:
- f.write('Sample raw data')
-
-with open(os.path.join('data', 'processed', 'data.csv'), 'w') as f:
- f.write('Sample processed data')
-
-with open(os.path.join('data', 'models', 'model.pkl'), 'wb') as f:
- f.write(b'Sample model pickle file')
-
-with open(os.path.join('src', 'models', 'model.py'), 'w') as f:
- f.write('Sample model source code')
-
-with open(os.path.join('src', 'preprocessing', 'preprocessing.py'), 'w') as f:
- f.write('Sample preprocessing source code')
-
-with open(os.path.join('src', 'utils', 'utils.py'), 'w') as f:
- f.write('Sample utility source code')
-
-with open(os.path.join('notebooks', 'EDA.ipynb'), 'w') as f:
- f.write('Sample Jupyter notebook for EDA')
-
-with open(os.path.join('tests', 'test_models.py'), 'w') as f:
- f.write('Sample unit tests for models')
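
The deleted StuctureMaker.py called os.makedirs without exist_ok, so a second run would raise FileExistsError. A minimal, re-runnable sketch of the same layout (illustrative only, not part of the repository) could look like this:

```python
# Illustrative, re-runnable replacement for the deleted StuctureMaker.py.
# pathlib with exist_ok=True makes repeated runs a no-op instead of an error.
from pathlib import Path

FOLDERS = [
    "data/raw", "data/processed", "data/models",
    "src/models", "src/preprocessing", "src/utils",
    "notebooks", "tests",
]

for folder in FOLDERS:
    Path(folder).mkdir(parents=True, exist_ok=True)

# Placeholder files; touch() with exist_ok=True leaves existing files untouched.
for placeholder in ["notebooks/EDA.ipynb", "tests/test_models.py"]:
    Path(placeholder).touch(exist_ok=True)
```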
diff --git a/src/.DS_Store b/src/.DS_Store
new file mode 100644
index 0000000..0bcf869
Binary files /dev/null and b/src/.DS_Store differ
diff --git a/src/data_scraping/.DS_Store b/src/data_scraping/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/src/data_scraping/.DS_Store differ
diff --git a/src/data_scraping/Amazon/__init__.py b/src/data_scraping/Amazon/__init__.py
deleted file mode 100644
index e1f257f..0000000
--- a/src/data_scraping/Amazon/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from scripts.main import Scraper
-
-__all__ = ['Scraper']
diff --git a/src/data_scraping/Amazon/amazon.db b/src/data_scraping/Amazon/amazon.db
deleted file mode 100644
index adef09e..0000000
Binary files a/src/data_scraping/Amazon/amazon.db and /dev/null differ
diff --git a/src/data_scraping/Amazon/scripts/__pycache__/dbConnector.cpython-310.pyc b/src/data_scraping/Amazon/scripts/__pycache__/dbConnector.cpython-310.pyc
deleted file mode 100644
index fe0a691..0000000
Binary files a/src/data_scraping/Amazon/scripts/__pycache__/dbConnector.cpython-310.pyc and /dev/null differ
diff --git a/src/data_scraping/Amazon/scripts/__pycache__/genricHtmlib.cpython-310.pyc b/src/data_scraping/Amazon/scripts/__pycache__/genricHtmlib.cpython-310.pyc
deleted file mode 100644
index fdec6b2..0000000
Binary files a/src/data_scraping/Amazon/scripts/__pycache__/genricHtmlib.cpython-310.pyc and /dev/null differ
diff --git a/src/data_scraping/Amazon/scripts/__pycache__/productList.cpython-310.pyc b/src/data_scraping/Amazon/scripts/__pycache__/productList.cpython-310.pyc
deleted file mode 100644
index ac379f4..0000000
Binary files a/src/data_scraping/Amazon/scripts/__pycache__/productList.cpython-310.pyc and /dev/null differ
diff --git a/src/data_scraping/Amazon/scripts/__pycache__/useragent.cpython-310.pyc b/src/data_scraping/Amazon/scripts/__pycache__/useragent.cpython-310.pyc
deleted file mode 100644
index adc92ae..0000000
Binary files a/src/data_scraping/Amazon/scripts/__pycache__/useragent.cpython-310.pyc and /dev/null differ
diff --git a/src/data_scraping/Amazon/scripts/productList.py b/src/data_scraping/Amazon/scripts/productList.py
deleted file mode 100644
index 4b7644b..0000000
--- a/src/data_scraping/Amazon/scripts/productList.py
+++ /dev/null
@@ -1,28 +0,0 @@
-product_categories = [
- 'mobiles',
- 'laptops',
- 'televisions',
- 'cameras',
- 'headphones',
- 'speakers',
- 'watches',
- 'air-conditioners',
- 'refrigerators',
- 'washing-machines',
- 'books',
- 'shoes',
- 'clothing',
- 'bags',
- 'jewellery',
- 'home-decor',
- 'home-furnishing',
- 'cricket-equipments',
- 'football-equipments',
- 'badminton-equipments',
- 'table-tennis-equipments',
- 'gym-equipments',
- 'kitchen-appliances',
- 'home-appliances',
- 'home-entertainment',
- 'home-improvement-tools',
-]
\ No newline at end of file
diff --git a/src/data_scraping/Amazon/scripts/useragent.py b/src/data_scraping/Amazon/scripts/useragent.py
deleted file mode 100644
index f4a7712..0000000
--- a/src/data_scraping/Amazon/scripts/useragent.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from random import choice
-
-UserAgents = [
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
- "Mozilla/5.0 (iPhone; CPU iPhone OS 15_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/105.0.5195.98 Mobile/15E148 Safari/604.1",
- "Mozilla/5.0 (iPad; CPU OS 15_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/105.0.5195.98 Mobile/15E148 Safari/604.1",
- "Mozilla/5.0 (iPod; CPU iPhone OS 15_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/105.0.5195.98 Mobile/15E148 Safari/604.1",
- "Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.77 Mobile Safari/537.36",
- "Mozilla/5.0 (Linux; Android 10; SM-A205U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.77 Mobile Safari/537.36",
- "Mozilla/5.0 (Linux; Android 10; SM-A102U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.77 Mobile Safari/537.36",
- "Mozilla/5.0 (Linux; Android 10; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.77 Mobile Safari/537.36",
- "Mozilla/5.0 (Linux; Android 10; SM-N960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.77 Mobile Safari/537.36",
- "Mozilla/5.0 (Linux; Android 10; LM-Q720) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.77 Mobile Safari/537.36",
- "Mozilla/5.0 (Linux; Android 10; LM-X420) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.77 Mobile Safari/537.36",
- "Mozilla/5.0 (Linux; Android 10; LM-Q710(FGN)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.77 Mobile Safari/537.36",
-]
-
-def get_ua():
- return choice(UserAgents)
-
-# import the choice function from the random module
-# create a list of user agents
-# create a function that returns a random user agent from the list
-# return the user agent
-
-# user agents are used to identify the browser, operating system, device, and/or application that is making a request to a website
\ No newline at end of file
diff --git a/src/data_scraping/Flipkart/flipkart.db b/src/data_scraping/Flipkart/flipkart.db
deleted file mode 100644
index 78685bd..0000000
Binary files a/src/data_scraping/Flipkart/flipkart.db and /dev/null differ
diff --git a/src/data_scraping/Flipkart/scripts/__pycache__/dbConnector.cpython-310.pyc b/src/data_scraping/Flipkart/scripts/__pycache__/dbConnector.cpython-310.pyc
deleted file mode 100644
index 7c9262e..0000000
Binary files a/src/data_scraping/Flipkart/scripts/__pycache__/dbConnector.cpython-310.pyc and /dev/null differ
diff --git a/src/data_scraping/Flipkart/scripts/__pycache__/genricHtmlib.cpython-310.pyc b/src/data_scraping/Flipkart/scripts/__pycache__/genricHtmlib.cpython-310.pyc
deleted file mode 100644
index 7fb7150..0000000
Binary files a/src/data_scraping/Flipkart/scripts/__pycache__/genricHtmlib.cpython-310.pyc and /dev/null differ
diff --git a/src/data_scraping/Flipkart/scripts/__pycache__/productList.cpython-310.pyc b/src/data_scraping/Flipkart/scripts/__pycache__/productList.cpython-310.pyc
deleted file mode 100644
index 6148340..0000000
Binary files a/src/data_scraping/Flipkart/scripts/__pycache__/productList.cpython-310.pyc and /dev/null differ
diff --git a/src/data_scraping/Flipkart/scripts/__pycache__/useragent.cpython-310.pyc b/src/data_scraping/Flipkart/scripts/__pycache__/useragent.cpython-310.pyc
deleted file mode 100644
index ab753e9..0000000
Binary files a/src/data_scraping/Flipkart/scripts/__pycache__/useragent.cpython-310.pyc and /dev/null differ
diff --git a/src/data_scraping/Flipkart/scripts/dbConnector.py b/src/data_scraping/Flipkart/scripts/dbConnector.py
deleted file mode 100644
index 89f4244..0000000
--- a/src/data_scraping/Flipkart/scripts/dbConnector.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import sqlite3
-import os
-
-class FlipkartDatabaseConnector:
- def __init__(self, stamp):
- self.dbPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../flipkart.db")
- self.conn = sqlite3.connect(self.dbPath)
- self.cur = self.conn.cursor()
- self.welcomeMessage = "Welcome to Flipkart Scraper. This is the database for the Flipkart Scraper. This database was created on {}.".format(stamp)
-
- def schemaMaker(self):
- # creating tables
- self.cur.execute("""CREATE TABLE products (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- sku TEXT NOT NULL,
- name TEXT NOT NULL,
- description TEXT NOT NULL,
- image_path TEXT NOT NULL,
- category TEXT NOT NULL,
- timestamp TEXT NOT NULL,
- URL TEXT NOT NULL,
- price TEXT NOT NULL
- );""")
- self.conn.commit()
- self.cur.execute("CREATE TABLE product_matches (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, product_sku INTEGER NOT NULL, match_id INTEGER NOT NULL, match_sku INTEGER NOT NULL);")
- self.conn.commit()
-
- def insertProduct(self, productDetails):
- self.cur.execute("INSERT INTO products (sku, name, description, image_path, category, timestamp, URL, price) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", (productDetails["sku"], productDetails["name"], productDetails["description"], productDetails["image_path"], productDetails["category"], productDetails["timestamp"], productDetails["URL"], productDetails["price"]))
- self.conn.commit()
-
- def fetchAllProducts(self):
- self.cur.execute("SELECT * FROM products")
- return self.cur.fetchall()
-
- def clearDatabase(self):
- self.cur.execute("DELETE FROM products")
- self.conn.commit()
- self.cur.execute("DELETE FROM product_matches")
- self.conn.commit()
-
- def removeDuplicates(self):
- self.cur.execute("DELETE FROM products WHERE rowid NOT IN (SELECT MIN(rowid) FROM products GROUP BY sku)")
- self.conn.commit()
\ No newline at end of file
diff --git a/src/data_scraping/Flipkart/scripts/genricHtmlib.py b/src/data_scraping/Flipkart/scripts/genricHtmlib.py
deleted file mode 100644
index a8aa265..0000000
--- a/src/data_scraping/Flipkart/scripts/genricHtmlib.py
+++ /dev/null
@@ -1,188 +0,0 @@
-import asyncio
-from multiprocessing import Pool
-import httpx
-import logging
-import os
-from datetime import datetime
-import lxml.html as html
-import pandas as pd
-import time
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-import warnings
-import requests
-import json
-import ndjson
-from useragent import get_ua
-
-warnings.filterwarnings("ignore")
-
-class SeleniumScraper:
- def __init__(self, timeout=10):
- self.timeout = timeout
- self.session = httpx.AsyncClient(timeout=self.timeout)
- self.reqSession = requests.Session()
- self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
- self.storagePath = os.path.join(
- os.path.dirname(os.path.abspath(__file__)), "../"
- )
-
- logging.basicConfig(
- filename=self.storagePath + "logs/FlipkartScraper_{}.log".format(self.stamp),
- level=logging.INFO,
- filemode="w",
- )
- self.headers = {
- 'authority': 'www.amazon.com',
- 'pragma': 'no-cache',
- 'cache-control': 'no-cache',
- 'dnt': '1',
- 'upgrade-insecure-requests': '1',
- 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
- 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
- 'sec-fetch-site': 'none',
- 'sec-fetch-mode': 'navigate',
- 'sec-fetch-dest': 'document',
- 'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
- }
-
- async def fetch_request_async(self, url, params=None):
- try:
- headers = {
- "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
- }
- response = await self.session.get(url, headers=headers)
-
- if response.status_code == 200:
- print("Response status code successful for url: {} and status code: {}".format(url, 200))
- return response.text
-
- if response.status_code == 301:
- # retry with redirect
- response = await self.session.get(response.headers['Location'])
- response.raise_for_status()
- if response.status_code == 200:
- return response.text
-
- except Exception as e:
- logging.info(
- "Exception occurred for url: {} and exception: {}".format(url, e)
- )
- pass
- return None
-
- def fetch_request_normal(self, url, params=None):
- try:
- headers = {
- "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
- }
- response = self.reqSession.get(url, headers=headers)
-
- if response.status_code == 200:
- #print("Response status code successful for url: {} and status code: {}".format(url, 200))
- return response.text
-
- if response.status_code == 301:
- print("Response status code successful for url: {} and status code: {}".format(url, 301))
- # retry with redirect
- response = requests.get(response.headers['Location'])
- response.raise_for_status()
- if response.status_code == 200:
- return response.text
-
- if response.status_code == 503:
- print("Response status code successful for url: {} and status code: {}".format(url, 503))
- return None
-
- except Exception as e:
- logging.info(
- "Exception occurred for url: {} and exception: {}".format(url, e)
- )
- print("Exception occurred for url: {} and exception: {}".format(url, e))
- pass
- return None
-
- def get_xpath_link(self, doc, xpath, website):
- try:
- name = doc.xpath("".join(xpath))
- for i in range(len(name)):
- if name[i].startswith("/"):
- name[i] = website + name[i]
- else:
- name[i] = name[i]
- return name
-
- except Exception as e:
- logging.info("Error in getting {}: {}".format(name, e))
- pass
- return None
- pass
-
- def get_selenium_driver(self):
- chrome_options = Options()
- #chrome_options.add_argument("--headless")
- chrome_options.add_argument("--window-size=1920,1080")
- chrome_options.add_argument("--disable-gpu")
- chrome_options.add_argument("--no-sandbox")
- chrome_options.add_argument("--disable-dev-shm-usage")
- chrome_options.add_argument("--disable-extensions")
- chrome_options.add_argument("--disable-logging")
- chrome_options.add_argument("--log-level=3")
- chrome_options.add_argument("--silent")
- chrome_options.add_argument("--blink-settings=imagesEnabled=false")
- driver = webdriver.Chrome(chrome_options=chrome_options)
- return driver
-
- def fetch_request_selenium(self, url, waiting_time=1):
- try:
- driver = self.get_selenium_driver()
- driver.get(url)
- time.sleep(waiting_time)
- doc = html.fromstring(driver.page_source)
- logging.info("Response status code successful for url: {} and status code: {}".format(url, 200))
- driver.close()
- return doc
-
- except Exception as e:
- logging.info(
- "Exception occurred for url: {} and exception: {}".format(url, e)
- )
- pass
-
- def get_xpath_data(self, doc, xpath):
- try:
- name = doc.xpath(xpath)
- return name
-
- except Exception as e:
- print("Error in getting {}: {}".format(name, e))
- pass
- return None
-
- def slow_page_scroll(self, driver, speed):
- current_scroll_position = driver.execute_script("return window.pageYOffset;")
- while current_scroll_position < driver.execute_script(
- "return document.body.scrollHeight;"
- ):
- driver.execute_script(
- "window.scrollTo(0, arguments[0]);", current_scroll_position
- )
- current_scroll_position += 1000
- time.sleep(speed)
-
- def data_storage(self, df_list, unique_id, name):
- df_combined = pd.concat(df_list, ignore_index=True)
- df_combined.drop_duplicates(subset=unique_id, inplace=True)
- df_combined.to_csv(
- self.storagePath + "raw/" + "{}_{}.csv".format(name, self.stamp),
- index=False,
- )
-
- def cleanData(self, array):
- array = [x.strip() for x in array]
- array = list(filter(None, array))
- array = [x.encode("ascii", "ignore").decode() for x in array]
- array = [x.replace("\n", "") for x in array]
- return array
-
-
\ No newline at end of file
diff --git a/src/data_scraping/Flipkart/scripts/productList.py b/src/data_scraping/Flipkart/scripts/productList.py
deleted file mode 100644
index 4b7644b..0000000
--- a/src/data_scraping/Flipkart/scripts/productList.py
+++ /dev/null
@@ -1,28 +0,0 @@
-product_categories = [
- 'mobiles',
- 'laptops',
- 'televisions',
- 'cameras',
- 'headphones',
- 'speakers',
- 'watches',
- 'air-conditioners',
- 'refrigerators',
- 'washing-machines',
- 'books',
- 'shoes',
- 'clothing',
- 'bags',
- 'jewellery',
- 'home-decor',
- 'home-furnishing',
- 'cricket-equipments',
- 'football-equipments',
- 'badminton-equipments',
- 'table-tennis-equipments',
- 'gym-equipments',
- 'kitchen-appliances',
- 'home-appliances',
- 'home-entertainment',
- 'home-improvement-tools',
-]
\ No newline at end of file
diff --git a/src/data_scraping/__init__.py b/src/data_scraping/__init__.py
new file mode 100644
index 0000000..dd11995
--- /dev/null
+++ b/src/data_scraping/__init__.py
@@ -0,0 +1,7 @@
+from .amazon_scraper import AmazonScraper
+from .flipkart_scraper import FlipkartScraper
+
+__all__ = [
+ "AmazonScraper",
+ "FlipkartScraper"
+]
\ No newline at end of file
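
A hedged sketch of how the reorganized package might be used: the import path and the getProducts / getProductDetails method names come from this diff, but the calling convention and the assumption that the repository root is on sys.path are not confirmed here.

```python
# Hypothetical usage of the flattened scraper package; method names mirror
# the diff, everything else (paths, return shapes) is an assumption.
from src.data_scraping import AmazonScraper

amazon = AmazonScraper()
urls = amazon.getProducts("laptops", page=1) or []   # product URLs from one results page
for url in urls[:3]:
    details = amazon.getProductDetails(url)          # dict with sku, name, price, ...
    if details:
        print(details.get("sku"), details.get("price"))
```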
diff --git a/src/data_scraping/Amazon/scripts/main.py b/src/data_scraping/amazon_scraper.py
similarity index 55%
rename from src/data_scraping/Amazon/scripts/main.py
rename to src/data_scraping/amazon_scraper.py
index 0829ad3..fb3cfc4 100644
--- a/src/data_scraping/Amazon/scripts/main.py
+++ b/src/data_scraping/amazon_scraper.py
@@ -2,17 +2,15 @@
import os
import logging
from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor
from lxml import html
import re
-from productList import product_categories
-import requests
-SeleniumScraper = SeleniumScraper()
+from lib import RequestHandler, htmlLib, DatabaseUtil
-class Scraper:
+class AmazonScraper(htmlLib):
def __init__(self):
- self.rival = "amazon"
+ super().__init__()
+ self.website_name = "amazon"
self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
self.url = "https://www.amazon.in/s?k={}&page={}&ref=sr_pg_{}"
self.website = "https://www.amazon.in"
@@ -24,18 +22,18 @@ def getProducts(self, keyword, page):
try:
url = self.url.format(keyword, page, page)
- response = SeleniumScraper.fetch_request_normal(url)
+ response = htmlLib.fetch_request_normal(url)
if response == None:
print("Normal request failed, trying selenium")
- doc = SeleniumScraper.fetch_request_selenium(url)
+ doc = htmlLib.fetch_request_selenium(url)
else:
doc = html.fromstring(response)
- productUrls = SeleniumScraper.get_xpath_link(doc, self.productUrlXpath, self.website)
+ productUrls = htmlLib.get_xpath_link(doc, self.productUrlXpath, self.website)
print(f"Found {len(productUrls)} products for product {keyword} on page {page}")
- pagination = SeleniumScraper.get_xpath_link(doc, self.paginationXpath, self.website)
+ pagination = htmlLib.get_xpath_link(doc, self.paginationXpath, self.website)
pagination = re.findall(r'\d+', pagination[0])
self.pagination = int(pagination[1])
return productUrls
@@ -46,10 +44,10 @@ def getProducts(self, keyword, page):
def getProductDetails(self, productUrl):
try:
- response = SeleniumScraper.fetch_request_normal(productUrl)
+ response = htmlLib.fetch_request_normal(productUrl)
if response == None:
print("Normal request failed, trying selenium")
- doc = SeleniumScraper.fetch_request_selenium(productUrl)
+ doc = htmlLib.fetch_request_selenium(productUrl)
else:
doc = html.fromstring(response)
except Exception as e:
@@ -65,35 +63,35 @@ def getProductDetails(self, productUrl):
except:
sku = []
- name = SeleniumScraper.cleanData(SeleniumScraper.get_xpath_data(doc, '//*[@id="productTitle"]//text()'))
+ name = htmlLib.cleanData(htmlLib.get_xpath_data(doc, '//*[@id="productTitle"]//text()'))
try:
- description = SeleniumScraper.cleanData(SeleniumScraper.get_xpath_data(doc, '//*[@id="productDescription"]//span//text()'))
+ description = htmlLib.cleanData(htmlLib.get_xpath_data(doc, '//*[@id="productDescription"]//span//text()'))
description = " ".join(description)
except Exception as e:
try:
- description = SeleniumScraper.cleanData(SeleniumScraper.get_xpath_data(doc, '//*[@id="feature-bullets"]//span//text()'))
+ description = htmlLib.cleanData(htmlLib.get_xpath_data(doc, '//*[@id="feature-bullets"]//span//text()'))
description = " ".join(description)
except:
logging.error(f"Error while scraping product description for product {productUrl}: {e}")
try:
- image_path = SeleniumScraper.cleanData(SeleniumScraper.get_xpath_data(doc, '//*[@id="landingImage"]//@src'))
+ image_path = htmlLib.cleanData(htmlLib.get_xpath_data(doc, '//*[@id="landingImage"]//@src'))
image_path = image_path[0]
except Exception as e:
try:
- image_path = SeleniumScraper.cleanData(SeleniumScraper.get_xpath_data(doc, '//*[@id="imgTagWrapperId"]//@src'))
+ image_path = htmlLib.cleanData(htmlLib.get_xpath_data(doc, '//*[@id="imgTagWrapperId"]//@src'))
image_path = ''.join(image_path)
except:
logging.error(f"Error while scraping product image for product {productUrl}: {e}")
- category = SeleniumScraper.cleanData(SeleniumScraper.get_xpath_data(doc, '//*[@class="a-link-normal a-color-tertiary"]//text()'))
+ category = htmlLib.cleanData(htmlLib.get_xpath_data(doc, '//*[@class="a-link-normal a-color-tertiary"]//text()'))
try:
category = category[-1]
except:
category = []
try:
- price = SeleniumScraper.cleanData(SeleniumScraper.get_xpath_data(doc, '//*[@class="a-price-whole"]//text()'))[0]
+ price = htmlLib.cleanData(htmlLib.get_xpath_data(doc, '//*[@class="a-price-whole"]//text()'))[0]
price = price.replace(",", "")
price = int(price)
except Exception as e:
@@ -135,51 +133,51 @@ def getProductDetails(self, productUrl):
print(productDetails)
return productDetails
- def main(self, keyword, number_of_threads):
- # get products
- products = []
- for page in range(1, self.pagination+1):
- products.extend(self.getProducts(keyword, page))
+ # def main(self, keyword, number_of_threads):
+ # # get products
+ # products = []
+ # for page in range(1, self.pagination+1):
+ # products.extend(self.getProducts(keyword, page))
- if self.pagination > 1:
- for page in range(2, self.pagination+1):
- products.extend(self.getProducts(keyword, page))
+ # if self.pagination > 1:
+ # for page in range(2, self.pagination+1):
+ # products.extend(self.getProducts(keyword, page))
- # get product details
- with ThreadPoolExecutor(max_workers=number_of_threads) as executor:
- results = executor.map(self.getProductDetails, products)
+ # # get product details
+ # with ThreadPoolExecutor(max_workers=number_of_threads) as executor:
+ # results = executor.map(self.getProductDetails, products)
- # save to db
- for result in results:
- print(f"Saving {result['sku']} to db")
- self.db.insertProduct(result)
+ # # save to db
+ # for result in results:
+ # print(f"Saving {result['sku']} to db")
+ # self.db.insertProduct(result)
-if __name__ == '__main__':
+# if __name__ == '__main__':
- number_of_threads = 10
- scraper = Scraper()
-
- # Log start of scraper
- logging.info(f"Starting {scraper.rival} scraper")
-
- # make db amazon.db if it doesn't exist
- if not os.path.exists(scraper.storagePath + scraper.rival + ".db"):
- print(f'Creating amazon.db at {scraper.storagePath+ scraper.rival + ".db"}')
- db = AmazonDatabaseConnector(scraper.stamp)
- logging.info(f"Creating {scraper.rival}.db")
- db.schemaMaker()
+# number_of_threads = 10
+# scraper = Scraper()
+
+# # Log start of scraper
+# logging.info(f"Starting {scraper.rival} scraper")
+
+# # make db amazon.db if it doesn't exist
+# if not os.path.exists(scraper.storagePath + scraper.rival + ".db"):
+# print(f'Creating amazon.db at {scraper.storagePath+ scraper.rival + ".db"}')
+# db = AmazonDatabaseConnector(scraper.stamp)
+# logging.info(f"Creating {scraper.rival}.db")
+# db.schemaMaker()
- scraper.db = AmazonDatabaseConnector(scraper.stamp)
+# scraper.db = AmazonDatabaseConnector(scraper.stamp)
- for keyword in product_categories:
- scraper.main(keyword, number_of_threads)
- scraper.pagination = 1
- SeleniumScraper.reqSession = requests.Session()
+# for keyword in product_categories:
+# scraper.main(keyword, number_of_threads)
+# scraper.pagination = 1
+# htmlLib.reqSession = requests.Session()
- scraper.db.removeDuplicates()
+# scraper.db.removeDuplicates()
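
Since the old __main__ block above is commented out rather than ported, the crawl loop has to live elsewhere. A minimal driver sketch follows; DatabaseUtil's constructor and method names are assumed to mirror the old AmazonDatabaseConnector (insertProduct / removeDuplicates), which this diff does not confirm.

```python
# Hypothetical driver replacing the commented-out __main__ block.
# Verify DatabaseUtil's actual API against src/lib/database_util.py before use.
from concurrent.futures import ThreadPoolExecutor

from src.data_scraping import AmazonScraper
from src.lib import DatabaseUtil, product_categories

scraper = AmazonScraper()
db = DatabaseUtil(scraper.stamp)                 # assumed constructor signature

for keyword in product_categories:
    urls = scraper.getProducts(keyword, page=1) or []
    with ThreadPoolExecutor(max_workers=10) as pool:
        for details in pool.map(scraper.getProductDetails, urls):
            if details:
                db.insertProduct(details)        # assumed method name

db.removeDuplicates()                            # assumed method name
```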
diff --git a/src/data_scraping/database.db b/src/data_scraping/database.db
deleted file mode 100644
index e69de29..0000000
diff --git a/src/data_scraping/Flipkart/scripts/main.py b/src/data_scraping/flipkart_scraper.py
similarity index 98%
rename from src/data_scraping/Flipkart/scripts/main.py
rename to src/data_scraping/flipkart_scraper.py
index 5eabf2f..8dbdb98 100644
--- a/src/data_scraping/Flipkart/scripts/main.py
+++ b/src/data_scraping/flipkart_scraper.py
@@ -9,7 +9,7 @@
SeleniumScraper = SeleniumScraper()
-class Scraper:
+class FlipkartScraper:
def __init__(self):
self.rival: str = "flipkart"
self.website = "https://www.flipkart.com/search?q="
@@ -171,6 +171,6 @@ def start(self):
self.db.removeDuplicates()
-if __name__ == '__main__':
- scraper = Scraper()
- scraper.start()
\ No newline at end of file
+# if __name__ == '__main__':
+# scraper = FlipkartScraper()
+# scraper.start()
\ No newline at end of file
diff --git a/lib/__init__.py b/src/lib/__init__.py
similarity index 56%
rename from lib/__init__.py
rename to src/lib/__init__.py
index f5d20d3..27739a0 100644
--- a/lib/__init__.py
+++ b/src/lib/__init__.py
@@ -1,11 +1,13 @@
-from .generic_html import ScraperSupport
+from .generic_html import htmlLib
from .product_list import product_categories
from .database_util import DatabaseUtil
from .request_handler import RequestHandler
+from .useragent_util import get_ua as get_user_agent
__all__ = [
- "ScraperSupport",
+ "htmlLib",
"product_categories",
"DatabaseUtil",
- "RequestHandler"
+ "RequestHandler",
+ "get_user_agent"
]
\ No newline at end of file
diff --git a/lib/database_util.py b/src/lib/database_util.py
similarity index 100%
rename from lib/database_util.py
rename to src/lib/database_util.py
diff --git a/lib/generic_html.py b/src/lib/generic_html.py
similarity index 86%
rename from lib/generic_html.py
rename to src/lib/generic_html.py
index cc96632..5d647b2 100644
--- a/lib/generic_html.py
+++ b/src/lib/generic_html.py
@@ -9,9 +9,6 @@
class htmlLib:
def __init__(self, timeout=10):
self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
- self.storagePath = os.path.join(
- os.path.dirname(os.path.abspath(__file__)), "../"
- )
def get_xpath_link(self, doc, xpath, website):
try:
@@ -40,11 +37,11 @@ def get_xpath_data(self, doc, xpath):
pass
return None
- def data_storage(self, df_list, unique_id, name):
+ def data_storage(self, df_list, unique_id, name, storagePath):
df_combined = pd.concat(df_list, ignore_index=True)
df_combined.drop_duplicates(subset=unique_id, inplace=True)
df_combined.to_csv(
- self.storagePath + "raw/" + "{}_{}.csv".format(name, self.stamp),
+ f"{storagePath}/{name}_{self.stamp}.csv",
index=False,
)
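
The data_storage change above moves the output directory from the library's own location to a caller-supplied storagePath. A small illustrative call, with the import route and the data/raw directory assumed (the directory must already exist):

```python
# Illustrative call against the new data_storage(df_list, unique_id, name, storagePath)
# signature; paths and the import route are assumptions.
import pandas as pd
from src.lib import htmlLib

frames = [
    pd.DataFrame({"sku": ["A1", "B2"], "price": [499, 999]}),
    pd.DataFrame({"sku": ["A1"], "price": [499]}),   # duplicate sku, dropped on save
]

helper = htmlLib()
helper.data_storage(frames, unique_id="sku", name="amazon_laptops",
                    storagePath="data/raw")
# -> writes data/raw/amazon_laptops_<timestamp>.csv
```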
diff --git a/lib/product_list.py b/src/lib/product_list.py
similarity index 100%
rename from lib/product_list.py
rename to src/lib/product_list.py
diff --git a/lib/request_handler.py b/src/lib/request_handler.py
similarity index 100%
rename from lib/request_handler.py
rename to src/lib/request_handler.py
diff --git a/src/data_scraping/Flipkart/scripts/useragent.py b/src/lib/useragent_util.py
similarity index 87%
rename from src/data_scraping/Flipkart/scripts/useragent.py
rename to src/lib/useragent_util.py
index f4a7712..584b37f 100644
--- a/src/data_scraping/Flipkart/scripts/useragent.py
+++ b/src/lib/useragent_util.py
@@ -21,10 +21,3 @@
def get_ua():
return choice(UserAgents)
-
-# import the choice function from the random module
-# create a list of user agents
-# create a function that returns a random user agent from the list
-# return the user agent
-
-# user agents are used to identify the browser, operating system, device, and/or application that is making a request to a website
\ No newline at end of file
diff --git a/lib/readme.md b/src/utils/constants.py
similarity index 100%
rename from lib/readme.md
rename to src/utils/constants.py