-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_cwb_rain.py
130 lines (113 loc) · 4.19 KB
/
data_cwb_rain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 24 10:01:59 2016
@author: wpk
"""
from datetime import datetime, timedelta
import requests
#import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
from pandas.io import sql
#from sqlalchemy import create_engine
import pymysql
pymysql.install_as_MySQLdb()
import MySQLdb
def scrapecwb():
# Scrape the HTML at the url
baseurl = "http://www.cwb.gov.tw/V7/observe/rainfall/Rain_Hr/22.htm"
r = requests.get(baseurl)
r.encoding = "utf-8"
# Turn the HTML into a Beautiful Soup object
# soup = BeautifulSoup(r.text, "html.parser")
soup = BeautifulSoup(r.text, "lxml")
# soup.prettify()
# code = r.encoding
# soup = BeautifulSoup(r.text.encode(code).decode("utf-8"), "lxml")
# print soup
# Create an object of the first object that is class=Form00
table = soup.find("table", {"class": "tablesorter"})
# print table
lo = []
st = []
titleid = []
time = []
rf = []
# Find all the <tr> tag pairs, skip the first one, then for each.
for row in table.find_all('tr')[1:]:
# Create a variable of all the <td> tag pairs in each <tr> tag pair,
col = row.find_all('td')
# print len(col)
for tds in range(2, 26):
# print tds
location = col[0].text
lo.append(location)
title = col[1].span["title"]
station = title[5:]
st.append(station)
titleid.append(title[:5])
# print col[tds].string
now = datetime.now() - timedelta(hours=tds - 1)
stntime = now.strftime("%Y-%m-%d %H:00")
time.append(stntime)
if col[tds].string == "X":
rf.append("null")
elif col[tds].string == "-":
rf.append("0")
elif float(col[tds].string) > 0:
rf.append(float(col[tds].string.strip()))
# Create a variable of the value of the columns
coldata = {'id': titleid, 'time': time, 'rf': rf}
stdata = {'lo': lo, 'st': st, 'id': titleid, 'time': time, 'rf': rf}
# print coldata
# Create a dataframe from the columns variable
df_cwb = pd.DataFrame(coldata, columns=['id', 'time', 'rf'])
stid_rain = pd.DataFrame(
stdata, columns=['lo', 'st', 'id']).drop_duplicates()
return df_cwb, stid_rain
# print df_cwb
def exinscwb2db():
now = datetime.now()
nowtime = now.strftime("%Y-%m-%d %H:00")
# print nowtime
last24 = datetime.now() - timedelta(hours=24)
last24time = last24.strftime("%Y-%m-%d %H:00")
# print last24time
selectsqlquery = "SELECT * FROM cwb_rainfall_data_temp WHERE " \
"time >= '" + nowtime + "' and time <= '" + last24time + "'"
deletesqlquery = "DELETE FROM cwb_rainfall_data_temp WHERE " \
"time >= '" + last24time + "' and time <= '" + nowtime + "'"
# print deletesqlquery
# db = create_engine('mysql://ncree_gcclab_adm:[email protected]/ggclab')
# db = create_engine('mysql://root:FCJ,[email protected]/ggclab')
db = MySQLdb.connect(host="140.109.80.146",
user="****",
passwd="****",
use_unicode=True,
charset="utf8",
db="IES")
# db.encoding = "utf-8"
df_mysql = pd.read_sql_query(selectsqlquery, db)
[df_cwb, stid_rain] = scrapecwb()
frames = [df_mysql, df_cwb]
df_result = pd.concat(frames, ignore_index=True,
verify_integrity=True, names='time')
# print df_result
df_result = df_result.groupby(['id', 'time', 'rf']).size().reset_index()
del df_result[0]
df_result = df_result.ix[:, 0:]
sql.execute(deletesqlquery, db)
sql.to_sql(df_result, con=db,
name='cwb_rainfall_data',
index=None,
if_exists='append',
flavor='mysql')
sql.to_sql(stid_rain, con=db,
name='cwb_rainfall_stid',
index=None,
if_exists='replace',
flavor='mysql')
print("writing cwb_rainfall data to DB.....")
print("writing cwb_rainfall_stid to DB.....")
#[df_cwb,stid_rain] = scrapecwb()
exinscwb2db()