Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable selection of GPU on inference page #1511

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion digits/inference/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class InferenceJob(Job):
A Job that exercises the forward pass of a neural network
"""

def __init__(self, model, images, epoch, layers, resize=True, **kwargs):
def __init__(self, model, images, epoch, layers, resize=True, gpu=None, **kwargs):
"""
Arguments:
model -- job object associated with model to perform inference on
Expand All @@ -40,6 +40,7 @@ def __init__(self, model, images, epoch, layers, resize=True, **kwargs):
epoch=epoch,
layers=layers,
resize=resize,
gpu=gpu
))

@override
Expand Down
6 changes: 4 additions & 2 deletions digits/inference/tasks/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class InferenceTask(Task):
A task for inference jobs
"""

def __init__(self, model, images, epoch, layers, resize, **kwargs):
def __init__(self, model, images, epoch, layers, resize, gpu=None, **kwargs):
"""
Arguments:
model -- trained model to perform inference on
Expand All @@ -40,7 +40,7 @@ def __init__(self, model, images, epoch, layers, resize, **kwargs):
self.inference_log_file = "inference.log"

# resources
self.gpu = None
self.gpu = gpu

# generated data
self.inference_data_filename = None
Expand Down Expand Up @@ -180,6 +180,8 @@ def offer_resources(self, resources):
if resources[gpu_key]:
for resource in resources[gpu_key]:
if resource.remaining() >= 1:
if self.gpu is not None and self.gpu != int(resource.identifier):
continue
self.gpu = int(resource.identifier)
reserved_resources[gpu_key] = [(resource.identifier, 1)]
break
Expand Down
17 changes: 17 additions & 0 deletions digits/model/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,23 @@ def validate_custom_network_snapshot(form, field):
tooltip="The job won't start until all of the chosen GPUs are available."
)

# Select 1 of several GPUs
select_one_of_gpus = utils.forms.SelectField(
'Select which GPU you would like to use',
choices=[('next', 'Next available')] + [(
index,
'#%s - %s (%s memory)' % (
index,
get_device(index).name,
sizeof_fmt(
get_nvml_info(index)['memory']['total']
if get_nvml_info(index) and 'memory' in get_nvml_info(index)
else get_device(index).totalGlobalMem)
),
) for index in config_value('gpu_list').split(',') if index],
default='next',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens here if there are no GPUs on the system?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lukeyeager the same behavior that you'd get if you were to start a training job without a GPU:

  1. If everything was set up for CPU only, you'd get the same page you currently get (without the multi-GPU form).
  2. If you manage to have Caffe compiled with CUDA but have no GPU on the system (suppose you removed it afterwards -- I tested this by masking all my GPUs with an invalid ID in CUDA_VISIBLE_DEVICES), then you'd get the page without the multi-GPU form and your job would fail exactly as the training job currently fails:
Check failed: error == cudaSuccess (38 vs. 0)  no CUDA-capable device is detected

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds reasonable to me!

)

# XXX For testing
# The Flask test framework can't handle SelectMultipleFields correctly
select_gpus_list = wtforms.StringField('Select which GPU[s] you would like to use (comma separated)')
Expand Down
17 changes: 13 additions & 4 deletions digits/model/images/classification/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from digits.pretrained_model.job import PretrainedModelJob
from digits.status import Status
from digits.utils import filesystem as fs
from digits.utils.forms import fill_form_if_cloned, save_form_to_job
from digits.utils.forms import fill_form_if_cloned, save_form_to_job, get_selected_gpu
from digits.utils.routing import request_wants_json, job_from_request
from digits.webapp import scheduler

Expand Down Expand Up @@ -333,8 +333,10 @@ def show(job, related_jobs=None):
"""
Called from digits.model.views.models_show()
"""
form = ImageClassificationModelForm()
return flask.render_template(
'models/images/classification/show.html',
form=form,
job=job,
framework_ids=[
fw.get_id()
Expand Down Expand Up @@ -384,14 +386,17 @@ def classify_one():
if 'show_visualizations' in flask.request.form and flask.request.form['show_visualizations']:
layers = 'all'

selected_gpu = get_selected_gpu(flask.request.form)

# create inference job
inference_job = ImageInferenceJob(
username=utils.auth.get_username(),
name="Classify One Image",
model=model_job,
images=[image_path],
epoch=epoch,
layers=layers
layers=layers,
gpu=selected_gpu
)

# schedule tasks
Expand Down Expand Up @@ -477,6 +482,7 @@ def classify_many():
epoch = float(flask.request.form['snapshot_epoch'])

paths, ground_truths = read_image_list(image_list, image_folder, num_test_images)
selected_gpu = get_selected_gpu(flask.request.form)

# create inference job
inference_job = ImageInferenceJob(
Expand All @@ -485,7 +491,8 @@ def classify_many():
model=model_job,
images=paths,
epoch=epoch,
layers='none'
layers='none',
gpu=selected_gpu
)

# schedule tasks
Expand Down Expand Up @@ -633,6 +640,7 @@ def top_n():
num_test_images = None

paths, _ = read_image_list(image_list, image_folder, num_test_images)
selected_gpu = get_selected_gpu(flask.request.form)

# create inference job
inference_job = ImageInferenceJob(
Expand All @@ -641,7 +649,8 @@ def top_n():
model=model_job,
images=paths,
epoch=epoch,
layers='none'
layers='none',
gpu=selected_gpu
)

# schedule tasks
Expand Down
16 changes: 15 additions & 1 deletion digits/model/images/generic/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from digits.status import Status
from digits.utils import filesystem as fs
from digits.utils import constants
from digits.utils.forms import fill_form_if_cloned, save_form_to_job
from digits.utils.forms import fill_form_if_cloned, save_form_to_job, get_selected_gpu
from digits.utils.routing import request_wants_json, job_from_request
from digits.webapp import scheduler

Expand Down Expand Up @@ -309,8 +309,10 @@ def show(job, related_jobs=None):
template, context = extension.get_inference_template(form)
inference_form_html = flask.render_template_string(template, **context)

generic_form = GenericImageModelForm()
return flask.render_template(
'models/images/generic/show.html',
form=generic_form,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am a bit uneasy about passing the form here when all you need is the list of GPUs. Wouldn't it be more explicit and self-explanatory to pass the list instead?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. Do you think this way is okay?

job=job,
view_extensions=view_extensions,
related_jobs=related_jobs,
Expand Down Expand Up @@ -361,6 +363,8 @@ def infer_one():
else:
resize = True

selected_gpu = get_selected_gpu(flask.request.form)

# create inference job
inference_job = ImageInferenceJob(
username=utils.auth.get_username(),
Expand All @@ -370,6 +374,7 @@ def infer_one():
epoch=epoch,
layers=layers,
resize=resize,
gpu=selected_gpu
)

# schedule tasks
Expand Down Expand Up @@ -446,6 +451,8 @@ def infer_extension():
if 'show_visualizations' in flask.request.form and flask.request.form['show_visualizations']:
layers = 'all'

selected_gpu = get_selected_gpu(flask.request.form)

# create inference job
inference_job = ImageInferenceJob(
username=utils.auth.get_username(),
Expand All @@ -455,6 +462,7 @@ def infer_extension():
epoch=epoch,
layers=layers,
resize=False,
gpu=selected_gpu
)

# schedule tasks
Expand Down Expand Up @@ -539,6 +547,8 @@ def infer_db():
else:
resize = True

selected_gpu = get_selected_gpu(flask.request.form)

# create inference job
inference_job = ImageInferenceJob(
username=utils.auth.get_username(),
Expand All @@ -548,6 +558,7 @@ def infer_db():
epoch=epoch,
layers='none',
resize=resize,
gpu=selected_gpu
)

# schedule tasks
Expand Down Expand Up @@ -633,6 +644,8 @@ def infer_many():
else:
resize = True

selected_gpu = get_selected_gpu(flask.request.form)

paths = []

for line in image_list.readlines():
Expand Down Expand Up @@ -664,6 +677,7 @@ def infer_many():
epoch=epoch,
layers='none',
resize=resize,
gpu=selected_gpu
)

# schedule tasks
Expand Down
39 changes: 33 additions & 6 deletions digits/templates/models/images/classification/show.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

{% extends "job.html" %}
{% from "helper.html" import serve_file %}
{% from "helper.html" import mark_errors %}

{% block job_content %}

<script src="{{ url_for('static', filename='js/model-graphs.js', ver=dir_hash) }}"></script>

{% set task = job.train_task() %}
{% set show_multi_gpu_form = form.select_one_of_gpus.choices| length > 2 %}

<div class="row">
<div class="col-sm-6">
Expand Down Expand Up @@ -102,13 +104,9 @@ <h4 class='text-center'>Dataset</h4>
{% endif %}
>
<h2>Trained Models</h2>
<div class="row">
<div class="col-sm-12">
<label for="snapshot_epoch">Select Model</label>
</div>
</div>
<div class="row">
<div class="col-sm-6">
<label for="snapshot_epoch">Select Model</label>
<div class="form-group">
<select id="snapshot_epoch" name="snapshot_epoch" class="form-control">
</select>
Expand Down Expand Up @@ -139,9 +137,36 @@ <h2>Trained Models</h2>
updateSnapshotList({% autoescape false %}{{task.snapshot_list()}}{% endautoescape %});
</script>
</div>
{% if show_multi_gpu_form %}
<button
formaction="{{url_for('digits.model.views.download', job_id=job.id())}}"
formmethod="post"
formenctype="multipart/form-data"
class="btn btn-info">
Download Model
</button>
<button
formaction="{{url_for('digits.model.views.to_pretrained', job_id=job.id())}}"
formmethod="post"
formenctype="multipart/form-data"
class="btn btn-success">
Make Pretrained Model
</button>
{% endif %}
</div>
{% if show_multi_gpu_form %}
<div class="col-sm-6">
<div class="form-group{{mark_errors([form.select_one_of_gpus])}}">
{{form.select_one_of_gpus.label}}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's indeed better to use the WTForm fields here but it's inconsistent with the rest of the file... so I'm not sure about that...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've changed that too. What do you think?

{{form.select_one_of_gpus(class='form-control', size=4)}}
</div>
</div>
{% else %}
<div class="col-sm-6">
<button
<label for="empty space">&nbsp;</label>
</div>
<div class="col-sm-6">
<button
formaction="{{url_for('digits.model.views.download', job_id=job.id())}}"
formmethod="post"
formenctype="multipart/form-data"
Expand All @@ -156,6 +181,7 @@ <h2>Trained Models</h2>
Make Pretrained Model
</button>
</div>
{% endif %}
</div>
{% if task.get_framework_id() in framework_ids %}
<div class="row">
Expand Down Expand Up @@ -184,6 +210,7 @@ <h3>Test a single image</h3>
</div>
</div>
</div>

<script type="text/javascript">
// When you fill in one field, the other gets blanked out
$("#image_path").change(function() { $("#image_file").val(""); });
Expand Down
40 changes: 33 additions & 7 deletions digits/templates/models/images/generic/show.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

{% extends "job.html" %}
{% from "helper.html" import serve_file %}
{% from "helper.html" import mark_errors %}

{% block job_content %}

<script src="{{ url_for('static', filename='js/model-graphs.js', ver=dir_hash) }}"></script>

{% set task = job.train_task() %}
{% set show_multi_gpu_form = form.select_one_of_gpus.choices| length > 2 %}

<div class="row">
<div class="col-sm-6">
Expand Down Expand Up @@ -97,12 +99,8 @@ <h4 class='text-center'>Dataset</h4>
>
<h2>Trained Models</h2>
<div class="row">
<div class="col-sm-12">
<div class="col-sm-6">
<label for="snapshot_epoch">Select Model</label>
</div>
</div>
<div class="row">
<div class="col-sm-8">
<div class="form-group">
<select id="snapshot_epoch" name="snapshot_epoch" class="form-control">
</select>
Expand Down Expand Up @@ -133,9 +131,36 @@ <h2>Trained Models</h2>
updateSnapshotList({% autoescape false %}{{task.snapshot_list()}}{% endautoescape %});
</script>
</div>
</div>
<div class="col-sm-4">
{% if show_multi_gpu_form %}
<button
formaction="{{url_for('digits.model.views.download', job_id=job.id())}}"
formmethod="post"
formenctype="multipart/form-data"
class="btn btn-info">
Download Model
</button>
<button
formaction="{{url_for('digits.model.views.to_pretrained', job_id=job.id())}}"
formmethod="post"
formenctype="multipart/form-data"
class="btn btn-success">
Make Pretrained Model
</button>
{% endif %}
</div>
{% if show_multi_gpu_form %}
<div class="col-sm-6">
<div class="form-group{{mark_errors([form.select_one_of_gpus])}}">
{{form.select_one_of_gpus.label}}
{{form.select_one_of_gpus(class='form-control', size=4)}}
</div>
</div>
{% else %}
<div class="col-sm-6">
<label for="empty space">&nbsp;</label>
</div>
<div class="col-sm-6">
<button
formaction="{{url_for('digits.model.views.download', job_id=job.id())}}"
formmethod="post"
formenctype="multipart/form-data"
Expand All @@ -150,6 +175,7 @@ <h2>Trained Models</h2>
Make Pretrained Model
</button>
</div>
{% endif %}
</div>
<div class="row">
<div class="col-sm-6">
Expand Down
Loading