Skip to content

Commit

Permalink
imports clean up
Browse files Browse the repository at this point in the history
  • Loading branch information
robjharrison committed May 31, 2024
1 parent 8317ef2 commit 42174ea
Showing 1 changed file with 15 additions and 19 deletions.
34 changes: 15 additions & 19 deletions ofsted_childrens_services_inspection_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,32 +58,26 @@
# url = url_stem + url_search_stem + '&level_1_types=' + str(search_category) + '&level_2_types=' + str(search_sub_category) + max_page_results_url




#
# Script admin settings


# Non-standard modules that might need installing
import os
import io
import requests
from requests.exceptions import RequestException # HTTP requests exception class

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import io
import re
from datetime import datetime
import nltk
import json
import git

# pdf search/data extraction
try:
import tabula
import PyPDF2
except ModuleNotFoundError:
print("Please install 'tabula-py' and 'PyPDF2' using pip")
import nltk
nltk.download('punkt') # tokeniser models/sentence segmentation
nltk.download('stopwords') # stop words ready for text analysis|NLP preprocessing

# nlp stuff for sentiment
try:
Expand All @@ -92,6 +86,15 @@
except ModuleNotFoundError:
print("Please install 'textblob' and 'gensim' using pip")


# pdf search/data extraction
try:
import tabula
import PyPDF2
except ModuleNotFoundError:
print("Please install 'tabula-py' and 'PyPDF2' using pip")


# handle optional excel export+active file links
try:
import xlsxwriter
Expand Down Expand Up @@ -120,13 +123,6 @@
logging.basicConfig(filename='output.log', level=logging.INFO, format='%(asctime)s - %(message)s')


# text analysis libs
nltk.download('punkt') # tokeniser models/sentence segmentation
nltk.download('stopwords') # stop words ready for text analysis|NLP preprocessing





#
# Function defs
Expand Down

0 comments on commit 42174ea

Please sign in to comment.