
Connection Error Fix #607

Closed · wants to merge 1 commit
28 changes: 23 additions & 5 deletions tensorboardX/event_file_writer.py
@@ -14,16 +14,18 @@
# ==============================================================================
"""Writes events to disk in a logdir."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function

import logging
import multiprocessing
import os
import socket
import threading
import time
import multiprocessing

import six
from botocore.exceptions import EndpointConnectionError
from requests import ConnectionError

from .proto import event_pb2
from .record_writer import RecordWriter, directory_check
@@ -193,6 +195,7 @@ def run(self):
# time to flush the writer, whichever is earlier. If we have
# data, write it. If not, an empty queue exception will be raised
# and we can proceed to flush the writer.
connection = True
while True:
now = time.time()
queue_wait_duration = self._next_flush_time - now
@@ -216,7 +219,22 @@
# Small optimization - if there are no pending data,
# there's no need to flush, since each flush can be
# expensive (e.g. uploading a new file to a server).
self._record_writer.flush()
# The try/except below prevents the thread from hanging when
# the connection gets dropped by either the GCS or S3 client.
try:
self._record_writer.flush()
if not connection:
logging.debug("Connection established.")
connection = True
except (ConnectionError, ConnectionAbortedError) as e:
if connection:
logging.debug(
"Connection lost, trying to "
"reestablish the connection."
)
connection = False
continue
self._has_pending_data = False
# Do it again in flush_secs.
self._next_flush_time = now + self._flush_secs
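
For readers skimming the diff, the change boils down to: track a `connection` flag, swallow connection errors raised by `flush()`, and retry on a later loop iteration instead of letting the writer thread die. Below is a minimal, self-contained sketch of that pattern; `FlakyWriter`, `flush_with_retry`, and the timing values are illustrative stand-ins, not part of tensorboardX.

```python
import logging
import time

logging.basicConfig(level=logging.DEBUG)


class FlakyWriter:
    """Stand-in for a record writer whose flush() may hit a dropped connection."""

    def __init__(self):
        self._fail_next = 2  # simulate two failed flushes before recovering

    def flush(self):
        if self._fail_next > 0:
            self._fail_next -= 1
            raise ConnectionError("simulated dropped connection")


def flush_with_retry(writer, flush_secs=1.0, max_cycles=5):
    """Keep retrying flush() on connection errors instead of crashing the loop."""
    connection = True
    next_flush_time = time.time()
    for _ in range(max_cycles):
        now = time.time()
        if now >= next_flush_time:
            try:
                writer.flush()
                if not connection:
                    logging.debug("Connection established.")
                    connection = True
            except (ConnectionError, ConnectionAbortedError):
                if connection:
                    logging.debug("Connection lost, trying to reestablish the connection.")
                    connection = False
                # Do not advance the flush timer; retry on the next cycle.
                continue
            next_flush_time = now + flush_secs
        time.sleep(0.1)


if __name__ == "__main__":
    flush_with_retry(FlakyWriter())
```

The key design point mirrored from the diff is that a failed flush leaves `_next_flush_time` (here `next_flush_time`) unchanged, so the loop keeps attempting to flush until the connection comes back, and the `connection` flag keeps the log from being spammed on every failed attempt.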