Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(vax,chn): Reduce running time and still ensure data validity #2620

Merged
merged 3 commits into from
Apr 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/output/vaccinations/status/status-vax-get-ts.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2022-04-29T09:27:52
2022-04-29T13:16:29
94 changes: 87 additions & 7 deletions scripts/output/vaccinations/status/status-vax-get.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,91 @@
module,execution_time (sec),success,timestamp,error,error_short
cowidev.vax.batch.argentina,7.57,True,2022-04-29T09:26:35,,
cowidev.vax.batch.australia,2.08,True,2022-04-29T09:26:29,,
cowidev.vax.batch.austria,,,2022-04-29T09:26:27,,
cowidev.vax.batch.belgium,12.01,True,2022-04-29T09:26:39,,
cowidev.vax.batch.bolivia,2.57,True,2022-04-29T09:26:30,,
cowidev.vax.batch.canada,3.37,False,2022-04-29T09:27:48,"Traceback (most recent call last):
File ""/home/lucas/repos/covid-19-data/scripts/src/cowidev/cmd/commons/get.py"", line 47, in run
cowidev.vax.batch.argentina,2.97,True,2022-04-28T07:00:17,,
cowidev.vax.batch.australia,1.74,True,2022-04-28T07:00:16,,
cowidev.vax.batch.austria,,,2022-04-28T07:00:14,,
cowidev.vax.batch.belgium,4.43,True,2022-04-28T07:00:18,,
cowidev.vax.batch.bolivia,1.87,True,2022-04-28T07:00:16,,
cowidev.vax.batch.canada,1.08,True,2022-04-28T07:00:15,,
cowidev.vax.batch.chile,0.6,True,2022-04-28T08:28:39,,
cowidev.vax.batch.czechia,2.16,True,2022-04-28T07:00:18,,
cowidev.vax.batch.denmark,2.3,True,2022-04-28T07:00:18,,
cowidev.vax.batch.ecdc,76.45,True,2022-04-28T07:01:32,,
cowidev.vax.batch.ecuador,1.24,True,2022-04-28T07:00:18,,
cowidev.vax.batch.estonia,0.68,True,2022-04-28T07:00:19,,
cowidev.vax.batch.france,0.48,True,2022-04-28T07:00:19,,
cowidev.vax.batch.germany,0.18,True,2022-04-28T07:00:18,,
cowidev.vax.batch.greece,1.52,True,2022-04-28T07:00:20,,
cowidev.vax.batch.hong_kong,0.36,True,2022-04-28T11:31:52,,
cowidev.vax.batch.indonesia,7.61,True,2022-04-28T07:00:26,,
cowidev.vax.batch.ireland,0.43,True,2022-04-28T07:00:19,,
cowidev.vax.batch.israel,7.64,True,2022-04-28T07:00:27,,
cowidev.vax.batch.italy,6.44,True,2022-04-28T07:00:26,,
cowidev.vax.batch.jersey,0.55,True,2022-04-28T07:00:27,,
cowidev.vax.batch.latvia,18.8,True,2022-04-28T07:00:45,,
cowidev.vax.batch.lithuania,1.36,True,2022-04-28T07:00:28,,
cowidev.vax.batch.luxembourg,1.29,True,2022-04-28T07:00:28,,
cowidev.vax.batch.malaysia,0.49,True,2022-04-28T07:00:27,,
cowidev.vax.batch.malta,0.75,True,2022-04-28T07:00:28,,
cowidev.vax.batch.netherlands,0.56,True,2022-04-28T07:00:29,,
cowidev.vax.batch.new_zealand,19.61,True,2022-04-28T07:00:48,,
cowidev.vax.batch.norway,0.34,True,2022-04-28T07:00:28,,
cowidev.vax.batch.peru,3.5,True,2022-04-28T07:00:32,,
cowidev.vax.batch.portugal,0.44,True,2022-04-28T07:00:29,,
cowidev.vax.batch.romania,10.5,True,2022-04-28T07:00:39,,
cowidev.vax.batch.saudi_arabia,11.3,True,2022-04-28T07:00:43,,
cowidev.vax.batch.singapore,2.46,True,2022-04-28T07:00:42,,
cowidev.vax.batch.slovakia,49.86,True,2022-04-28T07:01:32,,
cowidev.vax.batch.slovenia,1.36,True,2022-04-28T07:00:44,,
cowidev.vax.batch.south_korea,40.65,True,2022-04-28T07:01:25,,
cowidev.vax.batch.spc,15.57,True,2022-04-28T07:01:01,,
cowidev.vax.batch.sweden,9.86,True,2022-04-28T07:00:57,,
cowidev.vax.batch.switzerland,25.8,True,2022-04-28T07:01:23,,
cowidev.vax.batch.trinidad_and_tobago,1.23,True,2022-04-28T07:01:02,,
cowidev.vax.batch.ukraine,,,2022-04-28T07:01:02,,
cowidev.vax.batch.united_kingdom,5.45,True,2022-04-28T07:01:07,,
cowidev.vax.batch.united_states,29.01,True,2022-04-28T07:01:36,,
cowidev.vax.batch.uruguay,3.35,True,2022-04-28T07:01:27,,
cowidev.vax.batch.zimbabwe,0.77,True,2022-04-28T07:01:26,,
cowidev.vax.incremental.africacdc,,,2022-04-28T07:01:26,,
cowidev.vax.incremental.albania,6.48,True,2022-04-28T07:01:32,,
cowidev.vax.incremental.antigua_barbuda,4.66,True,2022-04-28T07:01:31,,
cowidev.vax.incremental.aruba,1.09,True,2022-04-28T07:01:32,,
cowidev.vax.incremental.azerbaijan,6.74,True,2022-04-28T07:01:39,,
cowidev.vax.incremental.bahrain,0.94,True,2022-04-28T07:01:33,,
cowidev.vax.incremental.bangladesh,10.86,True,2022-04-28T07:01:43,,
cowidev.vax.incremental.barbados,2.17,True,2022-04-28T07:01:35,,
cowidev.vax.incremental.brazil,0.04,True,2022-04-28T07:01:33,,
cowidev.vax.incremental.bulgaria,0.77,True,2022-04-28T07:01:34,,
cowidev.vax.incremental.china,29.39,True,2022-04-29T13:16:29,,
cowidev.vax.incremental.colombia,1.39,True,2022-04-28T07:01:36,,
cowidev.vax.incremental.costa_rica,1.79,True,2022-04-28T07:01:38,,
cowidev.vax.incremental.croatia,0.12,True,2022-04-28T07:01:36,,
cowidev.vax.incremental.cuba,2.17,True,2022-04-28T07:01:39,,
cowidev.vax.incremental.curacao,0.77,True,2022-04-28T07:01:39,,
cowidev.vax.incremental.cyprus,1.01,True,2022-04-28T07:01:40,,
cowidev.vax.incremental.dominican_republic,5.97,True,2022-04-28T07:01:44,,
cowidev.vax.incremental.el_salvador,0.74,True,2022-04-28T07:01:39,,
cowidev.vax.incremental.equatorial_guinea,2.11,True,2022-04-28T07:01:41,,
cowidev.vax.incremental.faeroe_islands,0.3,True,2022-04-28T07:01:40,,
cowidev.vax.incremental.fiji,0.0,True,2022-04-28T07:01:40,,
cowidev.vax.incremental.finland,0.21,True,2022-04-28T07:01:40,,
cowidev.vax.incremental.gabon,,,2022-04-28T07:01:40,,
cowidev.vax.incremental.georgia,0.38,True,2022-04-28T07:01:40,,
cowidev.vax.incremental.greenland,1.03,True,2022-04-28T07:01:41,,
cowidev.vax.incremental.guatemala,,,2022-04-28T07:01:41,,
cowidev.vax.incremental.guernsey,0.13,True,2022-04-28T07:01:42,,
cowidev.vax.incremental.hungary,1.09,True,2022-04-28T07:01:43,,
cowidev.vax.incremental.iceland,0.21,True,2022-04-28T07:01:42,,
cowidev.vax.incremental.india,1.05,True,2022-04-28T07:01:43,,
cowidev.vax.incremental.iran,2.47,True,2022-04-28T07:01:45,,
cowidev.vax.incremental.isle_of_man,0.8,True,2022-04-28T07:01:44,,
cowidev.vax.incremental.jamaica,1.44,True,2022-04-28T07:01:45,,
cowidev.vax.incremental.japan,1.07,True,2022-04-28T07:01:45,,
cowidev.vax.incremental.kazakhstan,7.06,True,2022-04-28T07:01:52,,
cowidev.vax.incremental.kosovo,1.98,True,2022-04-28T07:01:47,,
cowidev.vax.incremental.kyrgyzstan,6.19,True,2022-04-28T07:01:51,,
cowidev.vax.incremental.laos,3.1,True,2022-04-28T07:01:48,,
cowidev.vax.incremental.lebanon,0.39,False,2022-04-28T07:04:50,"Traceback (most recent call last):
File ""/mnt/owid_live_covid/covid-19-data/scripts/src/cowidev/cmd/commons/get.py"", line 47, in run
module.main()
File ""/home/lucas/repos/covid-19-data/scripts/src/cowidev/vax/batch/canada.py"", line 120, in main
Canada().export()
Expand Down
17 changes: 9 additions & 8 deletions scripts/src/cowidev/vax/incremental/china.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class China(CountryVaxBase):
"vaccinated": r"(?:接种|疫苗)的?总人数(?:达到?|为)([\d\.亿零]+万)",
"boosters": r"加强免疫(?:已经)?接种的?是?([\d\.亿零]+万)人",
}
num_links_complete = 6
num_links_complete = 3
timeout = 30

def read(self, last_update: str):
Expand All @@ -37,21 +37,20 @@ def read(self, last_update: str):
driver.get(self.source_url)
time.sleep(random.randint(1, 2))
Wait(driver, self.timeout).until(EC.presence_of_element_located((By.CLASS_NAME, "zxxx_list")))
time.sleep(random.randint(2, 3))
driver.execute_script("window.stop();")
links = self._get_links(driver)
for link in links:
data_ = self._parse_data(driver, link)
if data_["date"] <= last_update:
break
data.append(data_)
assert data_["date"] <= last_update, "Only read data back to: " + data_["date"]
return pd.DataFrame(data)

def _parse_data(self, driver, url):
driver.get(url)
time.sleep(random.randint(1, 2))
Wait(driver, self.timeout).until(EC.presence_of_element_located((By.ID, "xw_box")))
time.sleep(random.randint(2, 3))
Wait(driver, self.timeout).until(EC.text_to_be_present_in_element((By.ID, "xw_box"), "万剂次"))
driver.execute_script("window.stop();")
elem = driver.find_element_by_id("xw_box")
return {
Expand All @@ -72,11 +71,14 @@ def read_complete(self):
driver.get(self.source_url_complete)
time.sleep(random.randint(1, 2))
Wait(driver, self.timeout).until(EC.presence_of_element_located((By.CLASS_NAME, "zxxx_list")))
time.sleep(random.randint(2, 3))
driver.execute_script("window.stop();")
links = self._get_links_complete(driver)
for link in links[: self.num_links_complete]:
record = self._parse_data_complete(driver, link)
try:
record = self._parse_data_complete(driver, link)
except:
print("Failed to parse:", link)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure whether "print" is the right statement to use in this project when I want to report an error that is acceptable (i.e. non-fatal).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we want the pipeline to be aware that something is not working for a country, we simply raise an exception.

So here I would use

except:
  raise Exception(f"Failed to parse {link}")

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lucasrodes Thanks! Since the only exception the subroutine can raise is TimeoutException (from two different locations), I will remove the try-except and let those exceptions propagate from the subroutine. See #2621.

continue
if record:
records.append(record)
return pd.DataFrame(records)
Expand All @@ -95,8 +97,7 @@ def _clean_count(num_as_str):

driver.get(url)
time.sleep(random.randint(1, 2))
Wait(driver, self.timeout).until(EC.presence_of_element_located((By.ID, "xw_box")))
time.sleep(random.randint(2, 3))
Wait(driver, self.timeout).until(EC.text_to_be_present_in_element((By.ID, "xw_box"), "到此结束"))
driver.execute_script("window.stop();")
elem = driver.find_element_by_id("xw_box")
# Apply regex
Expand Down