Commit

Merge pull request #349 from mattjala/rescan_loop_fix
Fix timestamp inaccuracy during domain scan
mattjala authored Apr 18, 2024
2 parents 5c24172 + ade4b9d commit 8e395e6
Showing 7 changed files with 55 additions and 36 deletions.
23 changes: 21 additions & 2 deletions .github/workflows/python-package.yml
@@ -64,6 +64,7 @@ jobs:
USER2_PASSWORD: test
HSDS_ENDPOINT: http://127.0.0.1:5101
BUCKET_NAME: hsdstest
LOG_LEVEL: DEBUG
run: |
mkdir data
mkdir data/hsdstest
@@ -83,6 +84,7 @@ jobs:
USER2_PASSWORD: test
HSDS_ENDPOINT: http://127.0.0.1:5101
BUCKET_NAME: hsdstest
LOG_LEVEL: DEBUG
run: |
mkdir data
mkdir data/hsdstest
@@ -98,6 +100,7 @@ jobs:
- name: Run HSDS tests
if: ${{!(matrix.build-method == 'docker' && matrix.os == 'windows-latest')}}
id: hsds-tests
shell: bash
env:
ADMIN_PASSWORD: admin
@@ -107,9 +110,25 @@ jobs:
USER2_NAME: test_user2
USER2_PASSWORD: test
BUCKET_NAME: hsdstest
LOG_LEVEL: DEBUG
run: |
pytest tests/integ/setup_test.py
pytest tests/integ
python testall.py
- name: Show HSDS Logs on Fail (Docker)
# Only run if the whole workflow failed due to HSDS tests
if: ${{failure() && steps.hsds-tests.outcome == 'failure' && (matrix.build-method == 'docker' && matrix.os != 'windows-latest')}}
run: |
docker logs hsds-sn-1 >&1
docker logs hsds-dn-1 >&1
docker logs hsds-dn-2 >&1
docker logs hsds-dn-3 >&1
docker logs hsds-dn-4 >&1
- name: Show HSDS Logs on Fail (Manual)
# Only run if the whole workflow failed due to HSDS tests
if: ${{failure() && steps.hsds-tests.outcome == 'failure' && (matrix.build-method == 'manual')}}
run: |
cat hs.log
- name: Checkout h5pyd
if: ${{ ( matrix.os != 'windows-latest' ) }}
3 changes: 2 additions & 1 deletion admin/config/config.yml
@@ -33,8 +33,9 @@ max_tcp_connections: 100 # max number of inflight tcp connections
head_sleep_time: 10 # max sleep time between health checks for head node
node_sleep_time: 10 # max sleep time between health checks for SN/DN nodes
async_sleep_time: 1 # max sleep time between async task runs
scan_sleep_time: 10 # max sleep time between scaning runs
scan_sleep_time: 10 # max sleep time between scanning runs
scan_wait_time: 10 # min time to wait after a domain update before starting a scan
max_scan_duration: 180 # max time to wait for a scan to complete before raising error
gc_sleep_time: 10 # max time between runs to delete unused objects
s3_sync_interval: 1 # time to wait between s3_sync checks (in sec)
s3_age_time: 1 # time to wait since last update to write an object to S3
2 changes: 1 addition & 1 deletion hsds/config.py
@@ -147,7 +147,7 @@ def _load_cfg():
override = yml_override[x]
debug(f"got config override for {x}")

if override is not None:
if override is not None and override != "":
if cfgval is not None:
try:
# convert to same type as yaml
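The tightened check means an override that is present but empty (for example a config key exported as an empty environment variable) no longer replaces the yaml default. A minimal sketch of the idea, assuming the override comes from the environment; resolve_option is a hypothetical helper, not code from this commit:

import os

def resolve_option(name, yaml_default):
    # Unset and empty overrides both fall back to the yaml default,
    # mirroring the `override is not None and override != ""` check above.
    override = os.environ.get(name.upper())
    if override is not None and override != "":
        return type(yaml_default)(override)  # coerce to the yaml value's type
    return yaml_default

# With LOG_LEVEL set to "" in the environment, the yaml default still wins:
print(resolve_option("log_level", "INFO"))  # -> INFO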
12 changes: 8 additions & 4 deletions hsds/domain_sn.py
@@ -902,19 +902,23 @@ async def PUT_Domain(request):
if bucket:
post_params["bucket"] = bucket
req_send_time = getNow(app)
log.debug(f"Sending rescan request at time {req_send_time}")
await http_post(app, notify_req, data={}, params=post_params)

# Poll until the scan_complete time is greater than
# req_send_time or 3 minutes have elapsed
MAX_WAIT_TIME = 180
max_scan_duration = int(config.get("max_scan_duration", default=180))
RESCAN_SLEEP_TIME = 0.1

while True:
scan_time = await getScanTime(app, root_id, bucket=bucket)
if scan_time > req_send_time:
log.debug(f"Most recent scan on domain {root_id} completed at time {scan_time}")
if scan_time >= req_send_time:
log.info(f"scan complete for root: {root_id}")
break
if getNow(app) - req_send_time > MAX_WAIT_TIME:
log.warn(f"scan failed to complete in {MAX_WAIT_TIME} seconds for {root_id}")
if getNow(app) - req_send_time > max_scan_duration:
log.warn(f"scan failed to complete in {max_scan_duration}\
seconds for {root_id}")
raise HTTPServiceUnavailable()
log.debug(f"do_rescan sleeping for {RESCAN_SLEEP_TIME}s")
await asyncio.sleep(RESCAN_SLEEP_TIME) # avoid busy wait
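One plausible failure mode the >= change guards against, illustrated with made-up timestamps: on a clock with coarse resolution, a scan that completes within the same tick as the rescan request reports a timestamp equal to req_send_time, so the old strict comparison never became true and the loop polled until the timeout.

# Illustrative only: values are invented, not taken from HSDS.
req_send_time = 1713400000.000   # when the rescan request was sent
scan_time = 1713400000.000       # scan finished within the same clock tick

print(scan_time > req_send_time)    # False: the old loop keeps polling
print(scan_time >= req_send_time)   # True: the new loop exits promptly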
15 changes: 14 additions & 1 deletion hsds/util/timeUtil.py
@@ -11,6 +11,7 @@
##############################################################################
from datetime import datetime
import time
import os
import pytz


@@ -67,4 +68,16 @@ def getNow(app):
Returns a precise timestamp even on platforms where
time.time() has low resolution (e.g. Windows)
"""
return (time.perf_counter() - app["start_time_relative"]) + app["start_time"]
system = os.name
current_time = 0

if system == "nt":
# Windows
current_time = (time.perf_counter() - app["start_time_relative"]) + app["start_time"]
elif system == "posix":
# Unix
current_time = time.time()
else:
raise ValueError(f"Unsupported OS: {system}")

return current_time
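For the Windows branch to return wall-clock time, the app presumably stores a paired baseline captured at startup: a time.time() reading plus the time.perf_counter() reading taken at the same moment, so later calls add the elapsed monotonic delta to the wall-clock baseline. A minimal sketch of that assumption (the actual startup code is not part of this diff):

import time

app = {}
# Assumed startup initialization; not shown in this commit:
app["start_time"] = time.time()                    # wall-clock baseline
app["start_time_relative"] = time.perf_counter()   # monotonic baseline

# What getNow(app) computes on Windows:
now = (time.perf_counter() - app["start_time_relative"]) + app["start_time"]
print(now)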
22 changes: 0 additions & 22 deletions tests/integ/broadcast_test.py
@@ -36,28 +36,6 @@ def getRootUUID(self, domain, username=None, password=None):
domain, username=username, password=password, session=self.session
)

def checkVerbose(self, dset_id, headers=None, expected=None):
# do a flush with rescan, then check the expected return values are correct
req = f"{self.endpoint}/"
params = {"flush": 1, "rescan": 1}
rsp = self.session.put(req, params=params, headers=headers)
# should get a NO_CONTENT code,
self.assertEqual(rsp.status_code, 204)

# do a get and verify the additional keys are
req = f"{self.endpoint}/datasets/{dset_id}"
params = {"verbose": 1}

rsp = self.session.get(req, params=params, headers=headers)
self.assertEqual(rsp.status_code, 200)
rspJson = json.loads(rsp.text)

for k in expected:
self.assertTrue(k in rspJson)
self.assertEqual(rspJson[k], expected[k])

# main

def testPut1DDataset(self):
# Test PUT value with broadcast for 1d dataset
print("testPut1DDataset", self.base_domain)
14 changes: 9 additions & 5 deletions tests/integ/value_test.py
@@ -40,11 +40,17 @@ def getRootUUID(self, domain, username=None, password=None):

def checkVerbose(self, dset_id, headers=None, expected=None):
# do a flush with rescan, then check the expected return values are correct
num_retries = 5
req = f"{self.endpoint}/"
params = {"flush": 1, "rescan": 1}
rsp = self.session.put(req, params=params, headers=headers)
# should get a NO_CONTENT code,
self.assertEqual(rsp.status_code, 204)
for i in range(num_retries):
rsp = self.session.put(req, params=params, headers=headers)
if (rsp.status_code == 503):
# Retry
continue
# should get a NO_CONTENT code
self.assertEqual(rsp.status_code, 204)
break

# do a get and verify the additional keys are
req = f"{self.endpoint}/datasets/{dset_id}"
@@ -58,8 +64,6 @@ def checkVerbose(self, dset_id, headers=None, expected=None):
self.assertTrue(k in rspJson)
self.assertEqual(rspJson[k], expected[k])

# main

def testPut1DDataset(self):
# Test PUT value for 1d dataset
print("testPut1DDataset", self.base_domain)
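The retry loop above exists because the flush/rescan PUT can now return 503 (the HTTPServiceUnavailable raised in domain_sn.py when a scan exceeds max_scan_duration). A minimal sketch of the same retry-on-503 idea as a standalone helper, hypothetical and not part of this commit:

def put_with_retries(session, req, params=None, headers=None, num_retries=5):
    # Retry only on 503 (scan still in progress); return the last response otherwise.
    rsp = None
    for _ in range(num_retries):
        rsp = session.put(req, params=params, headers=headers)
        if rsp.status_code != 503:
            break
    return rsp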
