Skip to content

Commit

Permalink
added lib
Browse files Browse the repository at this point in the history
  • Loading branch information
paritoshtripathi935 committed Feb 4, 2024
1 parent a16b887 commit 8530eba
Show file tree
Hide file tree
Showing 13 changed files with 374 additions and 520 deletions.
11 changes: 11 additions & 0 deletions lib/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from .generic_html import ScraperSupport
from .product_list import product_categories
from .database_util import DatabaseUtil
from .request_handler import RequestHandler

# Names re-exported as the public interface of this package.
__all__ = [
    "ScraperSupport",
    "product_categories",
    "DatabaseUtil",
    "RequestHandler",
]
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import sqlite3
import os
import sys

class AmazonDatabaseConnector:
    """Hold an open SQLite connection and cursor for the Amazon scraper."""

    def __init__(self, stamp):
        """Connect to ``amazon.db`` located one directory above this module.

        Args:
            stamp: Value interpolated into ``welcomeMessage``.
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))
        self.dbPath = os.path.join(module_dir, "../amazon.db")
        connection = sqlite3.connect(self.dbPath)
        self.conn = connection
        self.cur = connection.cursor()
        self.welcomeMessage = "Welcome to Amazon Scraper. This is the database for the Amazon Scraper. This database was created on {}.".format(stamp)
class DatabaseUtil:
def __init__(self, stamp, database_name):
    """Open the scraper database and set up per-instance state.

    Args:
        stamp: Text embedded in ``welcomeMessage`` as the creation time.
        database_name: Base name of the database; the file opened is the
            relative path ``data/<database_name>.db``.
    """
    self.dbPath = "data/{}.db".format(database_name)
    # generate_stats() reads ``self.database`` and ``self.stats``; define
    # them here so those attributes exist on every instance.
    self.database = self.dbPath
    self.stats = {}
    self.conn, self.cur = self.connect(self.dbPath)
    self.welcomeMessage = f"Welcome to {database_name} Scraper. This is the database for the {database_name} Scraper. This database was created on {stamp}."

def schemaMaker(self):
# creating tables
Expand Down Expand Up @@ -49,4 +49,20 @@ def convertDBtoCsv(self):
for row in self.cur:
f.write(str(row))
f.write(' ')
self.conn.commit()
self.conn.commit()

def connect(self, db_name: str):
    """Open an existing SQLite database file.

    Args:
        db_name: Path of the database file to open.

    Returns:
        A ``(connection, cursor)`` pair for the opened database.

    If no file exists at ``db_name`` a message is printed and the process
    exits with status 1 (``SystemExit``).
    """
    if os.path.exists(db_name):
        connection = sqlite3.connect(db_name)
        return connection, connection.cursor()

    print(f"The database {db_name} does not exist")
    sys.exit(1)

def generate_stats(self):
    """Count distinct product SKUs and record the result in ``self.stats``.

    Returns:
        The ``self.stats`` dict, with key ``"products"`` set to the number
        of distinct ``sku`` values in the ``products`` table.
    """
    # ``self.database`` is only used when it has been set; otherwise fall
    # back to the path stored by ``__init__``.
    db_path = getattr(self, "database", None) or self.dbPath

    # connect() returns a (connection, cursor) pair, so unpack both.
    conn, cursor = self.connect(db_path)
    try:
        cursor.execute("SELECT COUNT(DISTINCT sku) FROM products")
        product_count = cursor.fetchone()[0]
    finally:
        # This connection is private to the call; release it promptly.
        conn.close()

    if getattr(self, "stats", None) is None:
        self.stats = {}
    self.stats["products"] = product_count
    return self.stats
58 changes: 58 additions & 0 deletions lib/generic_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import logging
import os
from datetime import datetime
import lxml.html as html
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

class htmlLib:
    """Helpers for extracting data from parsed HTML and saving it as CSV.

    Attributes set by ``__init__``:
        timeout: The value passed to the constructor.
        stamp: Construction time formatted as ``%Y-%m-%d_%H-%M-%S``.
        storagePath: The directory above this module's directory
            (``<module dir>/../``).
    """

    def __init__(self, timeout=10):
        """Record the timeout, a timestamp and the base storage path.

        Args:
            timeout: Stored on the instance as ``timeout``.
        """
        self.timeout = timeout
        self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.storagePath = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "../"
        )

    def get_xpath_link(self, doc, xpath, website):
        """Evaluate an XPath on ``doc`` and prefix root-relative results.

        Args:
            doc: Object exposing an ``xpath(expression)`` method.
            xpath: Expression string, or an iterable of string fragments
                that are concatenated into one expression.
            website: Prefix added to every result that starts with ``/``.

        Returns:
            The list of results, or ``None`` if anything raised.
        """
        try:
            links = doc.xpath("".join(xpath))
            logging.debug("%s", links)
            return [
                website + link if link.startswith("/") else link
                for link in links
            ]
        except Exception as e:
            # Report the requested xpath: the result variable may not exist
            # yet when the evaluation itself is what failed.
            logging.info("Error in getting %s: %s", xpath, e)
            return None

    def get_xpath_data(self, doc, xpath):
        """Evaluate ``xpath`` on ``doc``.

        Args:
            doc: Object exposing an ``xpath(expression)`` method.
            xpath: Expression passed to ``doc.xpath`` unchanged.

        Returns:
            Whatever ``doc.xpath`` returns, or ``None`` if it raised.
        """
        try:
            return doc.xpath(xpath)
        except Exception as e:
            logging.info("Error in getting %s: %s", xpath, e)
            return None

    def data_storage(self, df_list, unique_id, name):
        """Concatenate data frames, de-duplicate them and write one CSV.

        The file is written to ``<storagePath>raw/<name>_<stamp>.csv``.

        Args:
            df_list: Data frames to concatenate (index is reset).
            unique_id: Column label(s) passed as ``subset`` to
                ``drop_duplicates``.
            name: Prefix of the output file name.
        """
        combined = pd.concat(df_list, ignore_index=True)
        combined = combined.drop_duplicates(subset=unique_id)

        target_dir = os.path.join(self.storagePath, "raw")
        # to_csv does not create missing directories.
        os.makedirs(target_dir, exist_ok=True)
        combined.to_csv(
            os.path.join(target_dir, "{}_{}.csv".format(name, self.stamp)),
            index=False,
        )

    def cleanData(self, array):
        """Normalise a sequence of strings.

        Each item is stripped; items that are empty after stripping are
        dropped; non-ASCII characters and newlines are removed from the rest.

        Args:
            array: Iterable of strings.

        Returns:
            A new list of cleaned strings.
        """
        cleaned = []
        for item in array:
            text = item.strip()
            if not text:
                continue
            text = text.encode("ascii", "ignore").decode()
            cleaned.append(text.replace("\n", ""))
        return cleaned


# lib/__init__.py imports this class under the name ``ScraperSupport``.
ScraperSupport = htmlLib


180 changes: 0 additions & 180 deletions lib/genricHtmlib.py

This file was deleted.

28 changes: 28 additions & 0 deletions lib/product_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Product category names, kept in their original listing order.
product_categories = [
    "mobiles",
    "laptops",
    "televisions",
    "cameras",
    "headphones",
    "speakers",
    "watches",
    "air-conditioners",
    "refrigerators",
    "washing-machines",
    "books",
    "shoes",
    "clothing",
    "bags",
    "jewellery",
    "home-decor",
    "home-furnishing",
    "cricket-equipments",
    "football-equipments",
    "badminton-equipments",
    "table-tennis-equipments",
    "gym-equipments",
    "kitchen-appliances",
    "home-appliances",
    "home-entertainment",
    "home-improvement-tools",
]
Loading

0 comments on commit 8530eba

Please sign in to comment.