diff --git a/lib/__init__.py b/lib/__init__.py new file mode 100644 index 0000000..f5d20d3 --- /dev/null +++ b/lib/__init__.py @@ -0,0 +1,11 @@ +from .generic_html import ScraperSupport +from .product_list import product_categories +from .database_util import DatabaseUtil +from .request_handler import RequestHandler + +__all__ = [ + "ScraperSupport", + "product_categories", + "DatabaseUtil", + "RequestHandler" +] \ No newline at end of file diff --git a/src/data_scraping/Amazon/scripts/dbConnector.py b/lib/database_util.py similarity index 67% rename from src/data_scraping/Amazon/scripts/dbConnector.py rename to lib/database_util.py index 3d0d2c2..b764ca3 100644 --- a/src/data_scraping/Amazon/scripts/dbConnector.py +++ b/lib/database_util.py @@ -1,12 +1,12 @@ import sqlite3 import os +import sys -class AmazonDatabaseConnector: - def __init__(self, stamp): - self.dbPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../amazon.db") - self.conn = sqlite3.connect(self.dbPath) - self.cur = self.conn.cursor() - self.welcomeMessage = "Welcome to Amazon Scraper. This is the database for the Amazon Scraper. This database was created on {}.".format(stamp) +class DatabaseUtil: + def __init__(self, stamp, database_name): + self.dbPath = "data/{}.db".format(database_name) + self.conn, self.cur = self.connect(self.dbPath) + self.welcomeMessage = f"Welcome to {database_name} Scraper. This is the database for the {database_name} Scraper. This database was created on {stamp}." 
def schemaMaker(self): # creating tables @@ -49,4 +49,20 @@ def convertDBtoCsv(self): for row in self.cur: f.write(str(row)) f.write(' ') - self.conn.commit() \ No newline at end of file + self.conn.commit() + + def connect(self, db_name: str): + if not os.path.exists(db_name): + print(f"The database {db_name} does not exist") + sys.exit(1) + + conn = sqlite3.connect(db_name) + cursor = conn.cursor() + return conn, cursor + + def generate_stats(self): + conn, cursor = self.connect(self.dbPath) + cursor.execute("SELECT COUNT(DISTINCT sku) FROM products") + products = cursor.fetchall() + self.stats = {"products": products[0][0]} + return self.stats diff --git a/lib/generic_html.py b/lib/generic_html.py new file mode 100644 index 0000000..cc96632 --- /dev/null +++ b/lib/generic_html.py @@ -0,0 +1,58 @@ +import logging +import os +from datetime import datetime +import lxml.html as html +import pandas as pd +import warnings +warnings.filterwarnings("ignore") + +class ScraperSupport: + def __init__(self, timeout=10): + self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + self.storagePath = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "../" + ) + + def get_xpath_link(self, doc, xpath, website): + try: + name = doc.xpath("".join(xpath)) + print(name) + for i in range(len(name)): + if name[i].startswith("/"): + name[i] = website + name[i] + else: + name[i] = name[i] + return name + + except Exception as e: + logging.info("Error in getting {}: {}".format(xpath, e)) + pass + return None + pass + + def get_xpath_data(self, doc, xpath): + try: + name = doc.xpath(xpath) + return name + + except Exception as e: + print("Error in getting {}: {}".format(xpath, e)) + pass + return None + + def data_storage(self, df_list, unique_id, name): + df_combined = pd.concat(df_list, ignore_index=True) + df_combined.drop_duplicates(subset=unique_id, inplace=True) + df_combined.to_csv( + self.storagePath + "raw/" + "{}_{}.csv".format(name, self.stamp), + index=False, + ) + + def 
cleanData(self, array): + array = [x.strip() for x in array] + array = list(filter(None, array)) + array = [x.encode("ascii", "ignore").decode() for x in array] + array = [x.replace("\n", "") for x in array] + return array + + \ No newline at end of file diff --git a/lib/genricHtmlib.py b/lib/genricHtmlib.py deleted file mode 100644 index ba24d72..0000000 --- a/lib/genricHtmlib.py +++ /dev/null @@ -1,180 +0,0 @@ -import asyncio -from multiprocessing import Pool -import httpx -import logging -import os -from datetime import datetime -import lxml.html as html -import pandas as pd -import time -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -import warnings -import requests -import json -import ndjson -warnings.filterwarnings("ignore") - -class SeleniumScraper: - def __init__(self, timeout=10): - self.timeout = timeout - self.session = httpx.AsyncClient(timeout=self.timeout) - self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - self.storagePath = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "../" - ) - - logging.basicConfig( - filename=self.storagePath + "logs/amazonScraper_{}.log".format(self.stamp), - level=logging.INFO, - filemode="w", - ) - self.headers = { - 'authority': 'www.amazon.com', - 'pragma': 'no-cache', - 'cache-control': 'no-cache', - 'dnt': '1', - 'upgrade-insecure-requests': '1', - 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36', - 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', - 'sec-fetch-site': 'none', - 'sec-fetch-mode': 'navigate', - 'sec-fetch-dest': 'document', - 'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8', - } - - async def fetch_request_async(self, url, params=None): - try: - headers = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 
Safari/537.36" - } - response = await self.session.get(url, headers=headers) - - if response.status_code == 200: - print("Response status code successful for url: {} and status code: {}".format(url, 200)) - return response.text - - if response.status_code == 301: - # retry with redirect - response = await self.session.get(response.headers['Location']) - response.raise_for_status() - if response.status_code == 200: - return response.text - - except Exception as e: - logging.info( - "Exception occurred for url: {} and exception: {}".format(url, e) - ) - pass - return None - - def fetch_request_normal(self, url, params=None): - try: - headers = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36" - } - response = requests.get(url, headers=headers) - - if response.status_code == 200: - print("Response status code successful for url: {} and status code: {}".format(url, 200)) - return response.text - - if response.status_code == 301: - # retry with redirect - response = requests.get(response.headers['Location']) - response.raise_for_status() - if response.status_code == 200: - return response.text - - except Exception as e: - logging.info( - "Exception occurred for url: {} and exception: {}".format(url, e) - ) - pass - return None - - def get_xpath_link(self, doc, xpath, website): - try: - name = doc.xpath("".join(xpath)) - print(name) - for i in range(len(name)): - if name[i].startswith("/"): - name[i] = website + name[i] - else: - name[i] = name[i] - return name - - except Exception as e: - logging.info("Error in getting {}: {}".format(name, e)) - pass - return None - pass - - def get_selenium_driver(self): - chrome_options = Options() - chrome_options.add_argument("--headless") - chrome_options.add_argument("--window-size=1920,1080") - chrome_options.add_argument("--disable-gpu") - chrome_options.add_argument("--no-sandbox") - chrome_options.add_argument("--disable-dev-shm-usage") - 
chrome_options.add_argument("--disable-extensions") - chrome_options.add_argument("--disable-logging") - chrome_options.add_argument("--log-level=3") - chrome_options.add_argument("--silent") - chrome_options.add_argument("--blink-settings=imagesEnabled=false") - driver = webdriver.Chrome(chrome_options=chrome_options) - return driver - - async def fetch_request_selenium(self, url, waiting_time=1): - try: - driver = await self.get_selenium_driver() - driver.get(url) - time.sleep(waiting_time) - doc = html.fromstring(driver.page_source) - logging.info("Response status code successful for url: {} and status code: {}".format(url, 200)) - driver.close() - return doc - - except Exception as e: - logging.info( - "Exception occurred for url: {} and exception: {}".format(url, e) - ) - pass - - def get_xpath_data(self, doc, xpath): - try: - name = doc.xpath(xpath) - return name - - except Exception as e: - print("Error in getting {}: {}".format(name, e)) - pass - return None - - def slow_page_scroll(self, driver, speed): - current_scroll_position = driver.execute_script("return window.pageYOffset;") - while current_scroll_position < driver.execute_script( - "return document.body.scrollHeight;" - ): - driver.execute_script( - "window.scrollTo(0, arguments[0]);", current_scroll_position - ) - current_scroll_position += 1000 - time.sleep(speed) - - def data_storage(self, df_list, unique_id, name): - df_combined = pd.concat(df_list, ignore_index=True) - df_combined.drop_duplicates(subset=unique_id, inplace=True) - df_combined.to_csv( - self.storagePath + "raw/" + "{}_{}.csv".format(name, self.stamp), - index=False, - ) - - def cleanData(self, array): - array = [x.strip() for x in array] - array = list(filter(None, array)) - array = [x.encode("ascii", "ignore").decode() for x in array] - array = [x.replace("\n", "") for x in array] - return array - - \ No newline at end of file diff --git a/lib/product_list.py b/lib/product_list.py new file mode 100644 index 0000000..4b7644b --- 
/dev/null +++ b/lib/product_list.py @@ -0,0 +1,28 @@ +product_categories = [ + 'mobiles', + 'laptops', + 'televisions', + 'cameras', + 'headphones', + 'speakers', + 'watches', + 'air-conditioners', + 'refrigerators', + 'washing-machines', + 'books', + 'shoes', + 'clothing', + 'bags', + 'jewellery', + 'home-decor', + 'home-furnishing', + 'cricket-equipments', + 'football-equipments', + 'badminton-equipments', + 'table-tennis-equipments', + 'gym-equipments', + 'kitchen-appliances', + 'home-appliances', + 'home-entertainment', + 'home-improvement-tools', +] \ No newline at end of file diff --git a/lib/request_handler.py b/lib/request_handler.py new file mode 100644 index 0000000..acb8b11 --- /dev/null +++ b/lib/request_handler.py @@ -0,0 +1,230 @@ +import time +import requests +import tls_client +import traceback +import random +import sys +sys.path.append("src") + + +class RequestHandler: + """ + A class that handles HTTP requests with proxy rotation and retry mechanism. + + Args: + max_retries (int) : The maximum number of retries for a failed request. Default is 3. + proxies (list, optional) : A list of proxies. Default is None. + clients (list, mandatory) : A list of client identifiers. Default is None. + + Attributes: + max_retries (int) : The maximum number of retries for a failed request. + clients (list) : A list of client identifiers. + proxies_list (list) : A list of proxies. + status_code_count (dict) : A dictionary to keep track of the count of different status codes. + + Methods: + get_random_clients : Returns a random client identifier from the list. + get_proxy : Returns a list of proxies. + handle_request : Handles the HTTP request with proxy rotation and retry mechanism. + + Inherited Classes: + GeneralUtils : A class that handles general utilities. 
+ + Examples: + + >>> from lib.request_handler import RequestHandler + >>> request_handler = RequestHandler() + + """ + + def __init__(self, max_retries=5): + super().__init__() + self.max_retries = max_retries + self.tls_client = ['android11','android12','android13','android14','chrome111','chrome112','chrome113','chrome114'] + self.status_code_count = {} + self.status_code_count["total_number_of_requests"] = 0 + self.status_code_count["total_number_of_success"] = 0 + self.status_code_count["total_number_of_failed"] = 0 + self.status_code_count["total_number_of_exceptions"] = 0 + self.successful_status_codes = [200, 201] + self.failed_status_codes = [401, 403, 404, 409, 429, 400] + self.redirect_status_codes = [301, 302, 303, 307, 308] + self.server_down_status_codes = [500, 502, 503, 504] + self.session_cookies = None + + def get_random_clients(self): + return random.choice(self.tls_client) + + def handle_request( + self, + url, + method, + headers=None, + data=None, + params=None, + json=None, + requestType="tls", + country_code="sg", + proxies=None, + ): + """ + Handles the HTTP request with proxy rotation and retry mechanism. + + Args: + url (str) : The URL of the request. + method (str) : The HTTP method of the request. + headers (dict, optional) : The headers of the request. Default is None. + data (dict, optional) : The data of the request. Default is None. + params (dict, optional) : The query parameters of the request. Default is None. + json (dict, optional) : The JSON data of the request. Default is None. + requestType (str, optional) : The type of request. Default is "tls". + country_code (str, optional): The country code for proxy selection. Default is "sg". + + Returns: + requests.Response or None : The response object if the request is successful, None otherwise. 
+ + """ + retries = 0 + + while retries < self.max_retries: + retries += 1 + + self.status_code_count["total_number_of_requests"] += 1 + + session = tls_client.Session( + client_identifier=self.get_random_clients(), + random_tls_extension_order=True, + ) + + try: + if requestType == "tls": + if params: + url += f"&{params}" if "?" in url else f"?{params}" + response = session.execute_request( + method=method, + url=url, + headers=headers, + data=data, + json=json, + proxy=proxies, + cookies=self.session_cookies, + ) + + else: + response = requests.request( + method=method, + url=url, # type: ignore + headers=headers, + data=data, + json=json, + proxies=proxies, + params=params, + cookies=self.session_cookies, + ) + + self.session_cookies = response.cookies + self.handling_logging(f"Request: {url} {response.status_code}") + self.status_code_count[response.status_code] = ( + self.status_code_count.get(response.status_code, 0) + 1 + ) + + if response.status_code in self.successful_status_codes: + self.status_code_count["total_number_of_success"] += 1 + + self.handling_logging( + f""" + [SUCCESS] [{response.status_code}] + Request successful {url} {response.status_code} + Max retries: {self.max_retries} + Retries: {retries} + Headers: {headers} + Payload: {data} + """ + ) + + return response + + elif response.status_code in self.failed_status_codes: + self.handling_logging( + f""" + [BLOCKED] [{response.status_code}] + Request blocked {url} {response.status_code} {response.text} + Max retries: {self.max_retries} + Retries: {retries} + Proxy: {proxies} + Headers: {headers} + """, + "warning", + ) + + elif response.status_code in self.redirect_status_codes: + url = response.headers.get("location", None) + if url: + self.handling_logging( + f""" + [REDIRECT] [{response.status_code}] + Redirecting to {url} + Max retries: {self.max_retries} + Retries: {retries} + """, + "warning", + ) + else: + self.handling_logging( + f""" + [ERROR] [{response.status_code}] + No redirect 
url found + Max retries: {self.max_retries} + Retries: {retries} + """, + "error", + ) + return response + + elif response.status_code in self.server_down_status_codes: + self.handling_logging( + f""" + [SERVER DOWN] [{response.status_code}] + Server down {url} {response.status_code} {response.text} + Max retries: {self.max_retries} + Retries: {retries} + """, + "warning", + ) + time.sleep(10) + self.max_retries = 10 + + else: + self.handling_logging( + f""" + [ERROR] [{response.status_code}] + Request failed {url} {response.status_code} {response.text} + Max retries: {self.max_retries} + Retries: {retries} + """, + "error", + ) + + except Exception as e: + self.handling_logging( + f""" + [EXCEPTION] + {traceback.format_exc()} + {e} + """, + "error", + ) + self.status_code_count["total_number_of_exceptions"] += 1 + + + self.handling_logging(f""" + [FAILED] + url: {url} + Max retries: {self.max_retries} + Retries: {retries} + """, "error") + + self.status_code_count["total_number_of_failed"] += 1 + return None + + diff --git a/src/data_scraping/Amazon/scripts/genricHtmlib.py b/src/data_scraping/Amazon/scripts/genricHtmlib.py deleted file mode 100644 index 909fd9a..0000000 --- a/src/data_scraping/Amazon/scripts/genricHtmlib.py +++ /dev/null @@ -1,182 +0,0 @@ -from multiprocessing import Pool -import httpx -import logging -import os -from datetime import datetime -import lxml.html as html -import pandas as pd -import time -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -import warnings -import requests -warnings.filterwarnings("ignore") - -class SeleniumScraper: - def __init__(self, timeout=10): - self.timeout = timeout - self.session = httpx.AsyncClient(timeout=self.timeout) - self.reqSession = requests.Session() - self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - self.storagePath = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "../" - ) - - logging.basicConfig( - filename=self.storagePath + 
"logs/amazonScraper_{}.log".format(self.stamp), - level=logging.INFO, - filemode="w", - ) - self.headers = { - 'authority': 'www.amazon.com', - 'pragma': 'no-cache', - 'cache-control': 'no-cache', - 'dnt': '1', - 'upgrade-insecure-requests': '1', - 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36', - 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', - 'sec-fetch-site': 'none', - 'sec-fetch-mode': 'navigate', - 'sec-fetch-dest': 'document', - 'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8', - } - - async def fetch_request_async(self, url, params=None): - try: - headers = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36" - } - response = await self.session.get(url, headers=headers) - - if response.status_code == 200: - print("Response status code successful for url: {} and status code: {}".format(url, 200)) - return response.text - - if response.status_code == 301: - # retry with redirect - response = await self.session.get(response.headers['Location']) - response.raise_for_status() - if response.status_code == 200: - return response.text - - - except Exception as e: - logging.info( - "Exception occurred for url: {} and exception: {}".format(url, e) - ) - pass - return None - - def fetch_request_normal(self, url, params=None): - try: - headers = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36" - } - response = self.reqSession.get(url, headers=headers) - - if response.status_code == 200: - return response.text - - if response.status_code == 301: - # retry with redirect - response = requests.get(response.headers['Location']) - response.raise_for_status() - if response.status_code == 200: - return response.text - - if response.status_code == 503: - 
print("Response status code successful for url: {} and status code: {}".format(url, 503)) - return None - - except Exception as e: - logging.info( - "Exception occurred for url: {} and exception: {}".format(url, e) - ) - print("Exception occurred for url: {} and exception: {}".format(url, e)) - pass - return None - - def get_xpath_link(self, doc, xpath, website): - try: - name = doc.xpath("".join(xpath)) - for i in range(len(name)): - if name[i].startswith("/"): - name[i] = website + name[i] - else: - name[i] = name[i] - return name - - except Exception as e: - logging.info("Error in getting {}: {}".format(name, e)) - pass - return None - pass - - def get_selenium_driver(self): - chrome_options = Options() - chrome_options.add_argument("--headless") - chrome_options.add_argument("--window-size=1920,1080") - chrome_options.add_argument("--disable-gpu") - chrome_options.add_argument("--no-sandbox") - chrome_options.add_argument("--disable-dev-shm-usage") - chrome_options.add_argument("--disable-extensions") - chrome_options.add_argument("--disable-logging") - chrome_options.add_argument("--log-level=3") - chrome_options.add_argument("--silent") - chrome_options.add_argument("--blink-settings=imagesEnabled=false") - driver = webdriver.Chrome(chrome_options=chrome_options) - return driver - - def fetch_request_selenium(self, url, waiting_time=1): - try: - driver = self.get_selenium_driver() - driver.get(url) - time.sleep(waiting_time) - doc = html.fromstring(driver.page_source) - logging.info("Response status code successful for url: {} and status code: {}".format(url, 200)) - driver.close() - return doc - - except Exception as e: - logging.info( - "Exception occurred for url: {} and exception: {}".format(url, e) - ) - pass - - def get_xpath_data(self, doc, xpath): - try: - name = doc.xpath(xpath) - return name - - except Exception as e: - print("Error in getting {}: {}".format(name, e)) - pass - return None - - def slow_page_scroll(self, driver, speed): - 
current_scroll_position = driver.execute_script("return window.pageYOffset;") - while current_scroll_position < driver.execute_script( - "return document.body.scrollHeight;" - ): - driver.execute_script( - "window.scrollTo(0, arguments[0]);", current_scroll_position - ) - current_scroll_position += 1000 - time.sleep(speed) - - def data_storage(self, df_list, unique_id, name): - df_combined = pd.concat(df_list, ignore_index=True) - df_combined.drop_duplicates(subset=unique_id, inplace=True) - df_combined.to_csv( - self.storagePath + "raw/" + "{}_{}.csv".format(name, self.stamp), - index=False, - ) - - def cleanData(self, array): - array = [x.strip() for x in array] - array = list(filter(None, array)) - array = [x.encode("ascii", "ignore").decode() for x in array] - array = [x.replace("\n", "") for x in array] - return array - - \ No newline at end of file diff --git a/src/data_scraping/Amazon/scripts/main.py b/src/data_scraping/Amazon/scripts/main.py index 4ab4825..0829ad3 100644 --- a/src/data_scraping/Amazon/scripts/main.py +++ b/src/data_scraping/Amazon/scripts/main.py @@ -2,9 +2,7 @@ import os import logging from datetime import datetime -from dbConnector import AmazonDatabaseConnector from concurrent.futures import ThreadPoolExecutor -from genricHtmlib import SeleniumScraper from lxml import html import re from productList import product_categories @@ -16,19 +14,6 @@ class Scraper: def __init__(self): self.rival = "amazon" self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - self.storagePath = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "../" - ) - - logging.basicConfig( - filename=self.storagePath + "logs/{}_Scraper_{}.log".format(self.rival, self.stamp), - level=logging.INFO, - filemode="w", - ) - if not os.path.exists(self.storagePath + "logs"): - print(f"Creating logs folder at {self.storagePath + 'logs'}") - os.makedirs(self.storagePath + "logs") - self.url = "https://www.amazon.in/s?k={}&page={}&ref=sr_pg_{}" self.website = 
"https://www.amazon.in" self.productUrlXpath = '//*[@class="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"]//@href' @@ -51,8 +36,6 @@ def getProducts(self, keyword, page): print(f"Found {len(productUrls)} products for product {keyword} on page {page}") pagination = SeleniumScraper.get_xpath_link(doc, self.paginationXpath, self.website) - # 1-48 of over 40,000 results for - # need 48 using regex pagination = re.findall(r'\d+', pagination[0]) self.pagination = int(pagination[1]) return productUrls diff --git a/src/eda/EDA.ipynb b/src/eda/EDA.ipynb deleted file mode 100644 index e2af788..0000000 --- a/src/eda/EDA.ipynb +++ /dev/null @@ -1,98 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The database amazon.db does not exist\n" - ] - }, - { - "ename": "SystemExit", - "evalue": "1", - "output_type": "error", - "traceback": [ - "An exception has occurred, use %tb to see the full traceback.\n", - "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m 1\n" - ] - } - ], - "source": [ - "# connect with the database\n", - "import sqlite3\n", - "import os\n", - "import sys\n", - "import time\n", - "import datetime\n", - "sys.path.append(\"src\")\n", - "\n", - "def connect(db_name: str):\n", - " # check if the database exists\n", - " if not os.path.exists(db_name):\n", - " print(f\"The database {db_name} does not exist\")\n", - " sys.exit(1)\n", - " \n", - " conn = sqlite3.connect(db_name)\n", - " cursor = conn.cursor()\n", - " return cursor\n", - "\n", - "def generate_stats(database: str):\n", - " stats = {}\n", - " cursor = connect(database)\n", - " cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n", - " tables = cursor.fetchall()\n", - " stats[\"tables\"] = tables\n", - " print(f\"Tables in the database: {tables}\")\n", - " \n", - " for table in tables:\n", - " cursor.execute(f\"SELECT 
COUNT(*) FROM {table[0]}\")\n", - " rows = cursor.fetchall()\n", - " print(f\"Total rows in {table[0]}: {rows[0][0]}\")\n", - " stats[table[0]] = rows[0][0]\n", - " \n", - " for table in tables:\n", - " cursor.execute(f\"PRAGMA table_info({table[0]})\")\n", - " columns = cursor.fetchall()\n", - " print(f\"Columns in {table[0]}: {len(columns)}\")\n", - " stats[table[0]] = columns\n", - " \n", - " cursor.execute(\"SELECT COUNT(DISTINCT sku) FROM products\")\n", - " products = cursor.fetchall()\n", - " print(f\"Total unique products: {products[0][0]}\")\n", - " stats[\"products\"] = products[0][0]\n", - " \n", - " return stats\n", - " \n", - "\n", - "\n", - "generate_stats(\"amazon.db\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/src/eda/eda.py b/src/eda/auto_eda.py similarity index 58% rename from src/eda/eda.py rename to src/eda/auto_eda.py index 0579b98..4a2bad0 100644 --- a/src/eda/eda.py +++ b/src/eda/auto_eda.py @@ -1,4 +1,3 @@ -import sqlite3 import os import sys import time @@ -6,9 +5,6 @@ import pandas as pd import numpy as np -sys.path.append("src") -from utils import DatabaseUtils, LoggerUtil - class AutoEDA: """ This is a class to perform exploratory data analysis. @@ -36,4 +32,27 @@ class AutoEDA: def __init__(self, dataframe: pd.DataFrame): self.dataframe = dataframe + if not isinstance(self.dataframe, pd.DataFrame): + raise ValueError("Dataframe is not correct.") + + + def _generate_basic_stats(self): + """ + This method will generate basic statistics of the dataframe. 
+ """ + stats = self.dataframe.describe() + + # analyze the data types of the dataframe and suggest the user to convert the data types. + data_types = self.dataframe.dtypes + data_types = data_types.reset_index() + data_types.columns = ['Feature', 'Data Type'] + data_types = data_types.groupby('Data Type').count() + data_types = data_types.reset_index() + data_types.columns = ['Data Type', 'Count'] + + return stats, data_types + + + + \ No newline at end of file diff --git a/src/eda/dataset.db b/src/eda/dataset.db deleted file mode 100644 index e69de29..0000000 diff --git a/src/utils/__init__.py b/src/utils/__init__.py index df4c01d..b436fe3 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -1,4 +1,3 @@ -from .database_util import DatabaseUtils from .logger_util import LoggerUtil -__all__ = ["DatabaseUtils", "LoggerUtil"] \ No newline at end of file +__all__ = ["LoggerUtil"] \ No newline at end of file diff --git a/src/utils/database_util.py b/src/utils/database_util.py deleted file mode 100644 index 5a2a575..0000000 --- a/src/utils/database_util.py +++ /dev/null @@ -1,30 +0,0 @@ -# connect with the database -import sqlite3 -import os -import sys -import time -import datetime - -class DatabaseUtils: - def __init__(self, database: str): - self.database = database - self.stats = {} - - def connect(self, db_name: str): - if not os.path.exists(db_name): - print(f"The database {db_name} does not exist") - sys.exit(1) - - conn = sqlite3.connect(db_name) - cursor = conn.cursor() - return cursor - - def generate_stats(self): - cursor = self.connect(self.database) - cursor.execute("SELECT COUNT(DISTINCT sku) FROM products") - products = cursor.fetchall() - self.stats["products"] = products[0][0] - return self.stats - - -