Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Xccl group #2

Closed
wants to merge 1,085 commits into from
Closed
Show file tree
Hide file tree
Changes from 58 commits
Commits
Show all changes
1085 commits
Select commit Hold shift + click to select a range
062681a
[Profiler] Torch Profiler distributed info is not JSON serializable (…
sraikund16 Sep 13, 2024
1c04cbf
[BE] Use `C10_UNUSED` (#135914)
malfet Sep 13, 2024
e6b6835
Fix xpu memory stats error (#135818)
guangyey Sep 12, 2024
0ad5677
support allgather_into_tensor_coalesced
Chao1Han Sep 13, 2024
6cdc70b
[ROCm] skip test_fp8_cast_and_t on non-MI300 machines (#135917)
pragupta Sep 13, 2024
0cdc6a8
[DSD] Fix distributed state dict full_state_dict option hang during s…
wz337 Sep 12, 2024
6df91b5
real tensor prop for composite ops (#135717)
pianpwk Sep 13, 2024
eea5e6f
[DCP][DSD] Add a test case to demonstrate the workaround to load full…
wz337 Sep 12, 2024
e54b559
[inductor] More fixes on the keys of `constants` and `signature` dict…
Jokeren Sep 13, 2024
b38be72
[Inductor UT] Generalize inductor UT for intel GPU (Part 2) (#134556)
hoshibara Sep 13, 2024
9fd54d7
[Inductor UT] Generalize device-bias code in test_triton_kernels.py i…
etaf Sep 12, 2024
009e334
support reduce_scatter
Chao1Han Sep 13, 2024
7dc1788
[inductor] Remove the batch fusion passes from being a default (#135922)
anijain2305 Sep 13, 2024
ecbd989
refine test cases
Chao1Han Sep 13, 2024
a23ffb2
update ut
Chao1Han Sep 13, 2024
1d02dfe
add mpi check
Chao1Han Sep 13, 2024
c485bd8
update datatype map
Chao1Han Sep 13, 2024
2d1ae87
update
Chao1Han Sep 13, 2024
04226de
Merge branch 'xccl' into xccl-group
Chao1Han Sep 13, 2024
6184261
update
Chao1Han Sep 13, 2024
b346e99
remove fast_flush arguments (#135387)
int3 Sep 13, 2024
e504fb7
[Dynamo] Use custom backend to reenter metadata tf mode when tracing …
mlazos Sep 11, 2024
fafdd58
[Dynamo] Trace torch function modes entered outside of torch.compile …
mlazos Sep 12, 2024
30b007b
[Dynamo] Support thread local setattr (#135443)
mlazos Sep 12, 2024
0c080cb
[Dynamo] Simplify torch function mode stack guard (#135444)
mlazos Sep 12, 2024
2af3b8f
[Dynamo] Trace enter/exit of TorchFunctionModes (#135422)
mlazos Sep 12, 2024
7d5e0dd
[Dynamo] Remove ignored modes workaround (#135502)
mlazos Sep 12, 2024
c56728b
[Dynamo] Remove ignored modes from torch function mode stack guard (#…
mlazos Sep 12, 2024
31007cf
[Distributed] add FP8 support to NaN checker (#135891)
kwen2501 Sep 13, 2024
91d26d9
update
Chao1Han Sep 13, 2024
2f53d57
Update document for autocast on CPU (#135299)
CaoE Sep 13, 2024
ea2ecab
[AOTI][reland] Fix assert_function call in cpu autotune template (#13…
desertfire Sep 13, 2024
b5c52e9
Revert "[dynamo] Fix support for classmethod(property(...)) (#134968)"
pytorchmergebot Sep 13, 2024
1cdf658
Revert "[PT2][inductor][Optimus] Add pad_aten_mm_pass pattern to reso…
pytorchmergebot Sep 13, 2024
dc71e7a
Revert "[Dynamo] Remove ignored modes from torch function mode stack …
pytorchmergebot Sep 13, 2024
fca58bf
Revert "[Dynamo] Remove ignored modes workaround (#135502)"
pytorchmergebot Sep 13, 2024
ac16979
Revert "[Dynamo] Trace enter/exit of TorchFunctionModes (#135422)"
pytorchmergebot Sep 13, 2024
4734e35
Revert "[Dynamo] Simplify torch function mode stack guard (#135444)"
pytorchmergebot Sep 13, 2024
3f30360
Revert "[Dynamo] Support thread local setattr (#135443)"
pytorchmergebot Sep 13, 2024
eb7dd91
Revert "[Dynamo] Trace torch function modes entered outside of torch.…
pytorchmergebot Sep 13, 2024
7ed0563
Revert "[Dynamo] Use custom backend to reenter metadata tf mode when …
pytorchmergebot Sep 13, 2024
ba6e0f3
Remove cycle dependency by localizing the import. (#135926)
laithsakka Sep 13, 2024
2519e5a
[CUDA][FP8] Skip rowwise scaling test on sm89 (#135718)
eqy Sep 13, 2024
21ffa18
Fix "expand: SymIntArrayRef expected to contain only concrete integer…
ezyang Sep 13, 2024
ad2f0e9
Add remote cache time saved to compilation metrics (#135490)
jamesjwu Sep 13, 2024
ae02d66
[FlexAttention] Fix output layout (#135882)
drisspg Sep 13, 2024
564d00f
Revert "Fix clang-tidy warnings in Caffe2 code (#134935)"
pytorchmergebot Sep 13, 2024
a157745
[ROCm] Enable ROCm support for inductor's dynamic_rblock_scaling (#1…
jataylo Sep 13, 2024
6ef49fe
Revert "Pass ideep:lowp_kind to matmul_forward::compute on cache miss…
pytorchmergebot Sep 13, 2024
7834c0b
[AOTI][Tooling] Add stats summary (mean/min/max, etc) for jit inducto…
YUNQIUGUO Sep 13, 2024
bc0f330
[trymerge] Manually close merged PR when Github fails (#135890)
clee2000 Sep 13, 2024
18f9331
Revert "[aoti] Fix workspace generation for triton (#135552)"
pytorchmergebot Sep 13, 2024
3f69410
[gpu-profiler] Expose active and repeat in os env var (#135757)
dshi7 Sep 13, 2024
deee21c
Revert "[Inductor] Rename `cpp_wrapper_cuda.py` as `cpp_wrapper_gpu.p…
pytorchmergebot Sep 13, 2024
b6d6aa4
Revert "Validate input types for `torch.nn.Linear` and `torch.nn.Bili…
pytorchmergebot Sep 13, 2024
835e7bb
fix requirements.txt installation failure issue on Windows (#134567)
jingxu10 Sep 13, 2024
b856f35
Fix script name in the comments (#135507)
kit1980 Sep 13, 2024
4312794
[reland][export] fix re-export custom metadata (#135720)
yiming0416 Sep 13, 2024
a3d827a
Use python 3.11 for Large Wheel build (#136042)
atalman Sep 13, 2024
2e461e5
Add gpu and gpu_dynamic versions of add_loop (#135809)
laithsakka Sep 13, 2024
4f407c1
Only measure compile time instruction count for sum_floordiv benchmar…
laithsakka Sep 13, 2024
46935c8
Reduce default iterations to 5 . (#135773)
laithsakka Sep 13, 2024
a30d5ba
Fix bug in split-build workflows codegen (#136043)
malfet Sep 13, 2024
db5e1b4
Fix inductor-micro-benchmark results upload (take 2) (#136052)
huydhn Sep 13, 2024
baff86d
[MTIA tensor] allow shallow copy between CPU and MTIA tensors (#135871)
jvandebon Sep 13, 2024
e2d3af4
[ONNX] Remove logging apis from public (#133825)
justinchuby Sep 13, 2024
3c5d44d
Cleanup unused runner variants (#136058)
ZainRizvi Sep 13, 2024
aad556a
[PT2][Inductor][Optimus] Fix a corner case in remove_split_with_size_…
mengluy0125 Sep 13, 2024
b8eef50
Fix attr check for quantization spec (#135736)
jerryzh168 Sep 12, 2024
b608ff3
[Easy] Dont match to mm_plus_mm if not in max autotune (#135929)
eellison Sep 13, 2024
a00faf4
[3.13] fix 3.13 pickle error in serialization.py (#136034)
williamwen42 Sep 13, 2024
4237592
[Distributed] add pack-check method for float8_e4m3fn (#135961)
kwen2501 Sep 13, 2024
081c4a9
[BE] Use squeeze/unsqueeze in im2col (#136006)
malfet Sep 14, 2024
06bc717
Fix sum() forward for NJT (#131945)
jbschlosser Sep 13, 2024
2a83d68
update
Chao1Han Sep 14, 2024
7f62b86
update
Chao1Han Sep 14, 2024
5de4cb8
[Inductor UT] Generalize inductor UT for intel GPU (Part 3) (#135827)
hoshibara Sep 14, 2024
95496e4
[CI] Check that PyTorch is built with OpenMP (#136060)
malfet Sep 14, 2024
2e8d431
Fix tensor.data_ptr() representation overflow (#135567)
guangyey Sep 10, 2024
c48f5eb
Support reduce_scatter_base
Chao1Han Sep 14, 2024
51c5206
Use _amp_foreach_non_finite_check_and_unscale_ for CPU grads of Shard…
CaoE Sep 14, 2024
1786a17
Revert "Use _amp_foreach_non_finite_check_and_unscale_ for CPU grads …
pytorchmergebot Sep 14, 2024
731b178
[Dynamo] Use custom backend to reenter metadata tf mode when tracing …
mlazos Sep 13, 2024
4528777
[Dynamo] Trace torch function modes entered outside of torch.compile …
mlazos Sep 13, 2024
149d0b7
[Dynamo] Support thread local setattr (#135443)
mlazos Sep 13, 2024
ce3c74f
[Dynamo] Simplify torch function mode stack guard (#135444)
mlazos Sep 13, 2024
7743149
[Dynamo] Trace enter/exit of TorchFunctionModes (#135422)
mlazos Sep 13, 2024
5c67cf1
[Dynamo] Remove ignored modes workaround (#135502)
mlazos Sep 13, 2024
e77bd0e
[Dynamo] Remove ignored modes from torch function mode stack guard (#…
mlazos Sep 13, 2024
9b17dc4
Support reduce_scatter_tensor_coalesced
Chao1Han Sep 14, 2024
6cb3227
support barrier
Chao1Han Sep 14, 2024
911a43f
[TCPStore] Remove deprecated constructor (#136004)
fduwjj Sep 13, 2024
b9b6094
[ROCm] Skip pointwise associative scan tests due to regression (#135995)
jataylo Sep 14, 2024
e59f051
Merge branch 'xccl' into xccl-group
Chao1Han Sep 14, 2024
1a67e2b
[MPS] Add native im2col (#135706)
malfet Sep 14, 2024
d858c81
update
Chao1Han Sep 14, 2024
fea20f5
update
Chao1Han Sep 14, 2024
44dd218
Disable garbage collection during compile_time_instructions count in …
laithsakka Sep 13, 2024
a9bef85
[CI] Increase open file handles limit to 16K on MacOS (#136061)
malfet Sep 14, 2024
5a2be19
[Traceable FSDP2] Don't register RegisterPostBackwardFunction if user…
yf225 Sep 13, 2024
3352c9a
Add higher order operator name to the cache bypass exception (#135876)
oulgen Sep 13, 2024
e0e27f3
update
Chao1Han Sep 14, 2024
a815611
[Traceable FSDP2][Partitioner] Must save AC output if output has a ba…
yf225 Sep 14, 2024
f96a073
Use _amp_foreach_non_finite_check_and_unscale_ for CPU grads of Shard…
CaoE Sep 14, 2024
41b58a1
OpenReg: Fix issue when copying on the same device (#135956)
Zhenbin-8 Sep 14, 2024
72b868d
Revert "[Dynamo] Remove ignored modes from torch function mode stack …
pytorchmergebot Sep 14, 2024
838c912
Revert "[Dynamo] Remove ignored modes workaround (#135502)"
pytorchmergebot Sep 14, 2024
f3180f0
Revert "[Dynamo] Trace enter/exit of TorchFunctionModes (#135422)"
pytorchmergebot Sep 14, 2024
7975ec3
Revert "[Dynamo] Simplify torch function mode stack guard (#135444)"
pytorchmergebot Sep 14, 2024
46f5037
Revert "[Dynamo] Support thread local setattr (#135443)"
pytorchmergebot Sep 14, 2024
8c8a308
Revert "[Dynamo] Trace torch function modes entered outside of torch.…
pytorchmergebot Sep 14, 2024
23dec79
Revert "[Dynamo] Use custom backend to reenter metadata tf mode when …
pytorchmergebot Sep 14, 2024
db393fb
Add Half support for reflection and replication padding on CPU (#135931)
CaoE Sep 14, 2024
f97cccf
[3.13] fix 3.13 pickle error in torch/package (#136049)
williamwen42 Sep 13, 2024
b863750
[Pytorch] Consolidate Strobelight compile time profiler between OSS a…
kollasb Sep 14, 2024
b82122b
Only keep ListOfLinears module in basic_modules_benchmarks and add gp…
laithsakka Sep 13, 2024
b4c84c3
[AOTI] Fix a fallback op returning None issue (#135997)
desertfire Sep 13, 2024
228760b
[Dynamo] Use custom backend to reenter metadata tf mode when tracing …
mlazos Sep 14, 2024
5c5c33a
[Dynamo] Trace torch function modes entered outside of torch.compile …
mlazos Sep 14, 2024
14cabdf
[Dynamo] Support thread local setattr (#135443)
mlazos Sep 14, 2024
06caa2d
[Dynamo] Simplify torch function mode stack guard (#135444)
mlazos Sep 14, 2024
1b9daeb
[Dynamo] Trace enter/exit of TorchFunctionModes (#135422)
mlazos Sep 14, 2024
860838e
[Dynamo] Remove ignored modes workaround (#135502)
mlazos Sep 14, 2024
8df01c8
[Dynamo] Remove ignored modes from torch function mode stack guard (#…
mlazos Sep 14, 2024
7f5abb4
[BE][Ez]: Update pybind11 to 2.13.6. Exposes new conduit cross-compat…
Skylion007 Sep 14, 2024
c64ae60
[dynamo] Fix support for classmethod(property(...)) (#134968)
jansel Sep 14, 2024
55299cf
[BE]: Update mypy to 1.11.2 (#133816)
Skylion007 Sep 14, 2024
e498b02
Add Triton CPU as an Inductor backend (#133408)
int3 Sep 13, 2024
426580a
Add CI for Triton CPU backend (#135342)
int3 Sep 13, 2024
5b21d91
Fix dividing Mul by factor (#136079)
isuruf Sep 14, 2024
391f2d6
use a fast expand algorithm (#135999)
isuruf Sep 13, 2024
a5eb43d
Add TensorReferenceAnalysis and some tests (#135886)
bobrenjc93 Sep 14, 2024
a1a57a4
Optimize dict reconstruct to not codegen untouched values (#134876)
guilhermeleobas Sep 12, 2024
8072ebc
SKIP llama for dynamic size testing (#135960)
leslie-fang-intel Sep 13, 2024
386884e
[Traceable FSDP2] Ignore FSDP2 forward hook side-effects in AC; Suppo…
yf225 Sep 14, 2024
e1abd34
[audio hash update] update the pinned audio hash (#136106)
pytorchupdatebot Sep 15, 2024
31e42a4
Fix redundant move warnings by g++ (#134987)
cyyever Sep 15, 2024
357b7fb
Revert "[Pytorch] Consolidate Strobelight compile time profiler betwe…
pytorchmergebot Sep 15, 2024
382fad5
Deprecate _preserve_ops and consolidate with decomp_table (#135080)
tugsbayasgalan Sep 14, 2024
1904b09
Create export_for_inference API and expose core_aten as public facing…
tugsbayasgalan Sep 15, 2024
dec3403
Add some doc for export_for_training (#135918)
tugsbayasgalan Sep 15, 2024
a141c6b
[pytorch][monitoring] Dynamic backend for WaitCounter (#135967)
andriigrynenko Sep 15, 2024
ab9a7ea
Add decomposition for permute_copy (#130944)
rec Sep 10, 2024
e501ed7
Update link in distributed.tensor.parallel.rst (#136103)
H-Huang Sep 15, 2024
d2207c5
[Distributed] add pack-check method for float8_e5m2 (#136115)
kwen2501 Sep 15, 2024
9961aaa
[dynamo] simplify implementation for `functools.reduce` (#133778)
XuehaiPan Sep 14, 2024
951c21d
[dynamo] simplify implementation for `builtins.sum` (#133779)
XuehaiPan Sep 14, 2024
3117f2c
Revert "[BE]: Update mypy to 1.11.2 (#133816)"
pytorchmergebot Sep 16, 2024
bbc3fdb
Add python 3.13.0t build to Docker images (#136001)
atalman Sep 16, 2024
a803cb0
[AOTI] Refactor how cpp_wrapper specific options are set (#136035)
desertfire Sep 14, 2024
d833f49
[reland][Inductor] Rename `cpp_wrapper_cuda.py` as `cpp_wrapper_gpu.p…
desertfire Sep 16, 2024
13bd125
Delete stable prototype (#135911)
bigfootjon Sep 16, 2024
c33b058
Add decomposition for squeeze_copy (#130941)
rec Sep 16, 2024
090046b
[effects] Turn off dtype promotion for with_effects lowering (#136039)
IvanKobzarev Sep 13, 2024
0aa41eb
[ONNX] Run type promotion test in CI and update the table (#135915)
justinchuby Sep 16, 2024
b491e29
[BE][Ez]: Add full half/bfloat16 dtype for `unique` and `isin` (#136114)
Skylion007 Sep 16, 2024
0199fd4
Revert "[inductor] More fixes on the keys of `constants` and `signatu…
pytorchmergebot Sep 16, 2024
5193f23
[Pytorch] Cleanup Strobelight URL and shorten for readability (#136102)
kollasb Sep 16, 2024
23c0d26
[BE][Ez]: Fix missing float16 coverage for adaptive_pool3d_cpu (#136091)
Skylion007 Sep 16, 2024
7fe004f
Revert "Add CI for Triton CPU backend (#135342)"
pytorchmergebot Sep 16, 2024
d0cebed
Revert "Add Triton CPU as an Inductor backend (#133408)"
pytorchmergebot Sep 16, 2024
d3647d1
Remove accidentally committed code (#136154)
malfet Sep 16, 2024
f89ce4d
`torch.nn.MultiheadAttention`: docs: improvement (#136111)
kuraga Sep 16, 2024
717fca2
Drop outdated section 'Running clang-tidy' in CONTRIBUTING.md (#136146)
eugenekoran Sep 16, 2024
c977bb7
[Distributed] fix FileSystemWriter __init__ (#136135)
kwen2501 Sep 16, 2024
38caf10
[EZ] Fix spelling typo (#136157)
malfet Sep 16, 2024
31715be
[BE]: Update mypy to 1.11.2 (#133816)
Skylion007 Sep 16, 2024
7537f74
Refactor FxGraphCache.load into separate functions, so that AOTAutogr…
jamesjwu Sep 16, 2024
a0c7029
[c10d][Reland] Remove Option for ProcessGroup and Expose backend Opti…
fduwjj Sep 11, 2024
abd16a8
[torch/multiprocessing] Use multiprocessing.reduction.register Forki…
kiukchung Sep 16, 2024
3c97b0a
Use ncclAlltoAllv and ncclAlltoAll API when supported (#134499)
dsjohns2 Sep 16, 2024
bfbcdf4
Revert "[dynamo] Fix support for classmethod(property(...)) (#134968)"
pytorchmergebot Sep 16, 2024
b76d1b7
Add scaling arguments to bsr_dense_addmm (#136104)
pearu Sep 16, 2024
c12536b
[ONNX] Treat CompositeImplicitAutograd ops as normal ops in decomp (#…
justinchuby Sep 16, 2024
071da87
use csv extention for test report in order for it to be uploaded to s…
laithsakka Sep 16, 2024
37a08b3
Revert "fix compiled_autograd deadlock throw (#135795)"
pytorchmergebot Sep 16, 2024
3f74310
Back out "Flip triton kernel default layout constraint to "needs_fixe…
tissue3 Sep 17, 2024
d463a81
inductor: dont use default_dtype during rng functionalization (#136041)
bdhirsh Sep 14, 2024
dc82d27
make view.dtype always return an alias (#136074)
bdhirsh Sep 14, 2024
408fe41
[DSD][EZ] Minor update in _state_dict_utils.py (#136165)
wz337 Sep 16, 2024
e248c1d
Update real device in FSDP state_dict_utils (#134994)
ankurneog Sep 17, 2024
3b5e268
Revert "Optimize dict reconstruct to not codegen untouched values (#1…
pytorchmergebot Sep 17, 2024
2c4ae81
Revert "Add decomposition for squeeze_copy (#130941)"
pytorchmergebot Sep 17, 2024
462b727
Revert "Add decomposition for permute_copy (#130944)"
pytorchmergebot Sep 17, 2024
913f97e
Don't run reshape pattern match on dynamic shape size tensor (#136100)
ezyang Sep 17, 2024
ece8267
Add back optim type hints that were lost when *.pyi files were remove…
mauvilsa Sep 17, 2024
67b14ce
[ONNX] Fix numpy method to return the correct type (#136162)
justinchuby Sep 17, 2024
63dc5df
[Fix]: Update CPUINFO submodule to fix support for NON-SVE ARM Hardwa…
ng-05 Sep 17, 2024
8e5bb35
[PT2] Port merge_concats_pass to PT2 pre_grad passes (#135527)
huxintong Sep 17, 2024
cc365fd
[MTIA] Support torch.cuda.get_device_capability equivalent API on MTI…
ttrung149 Sep 17, 2024
785e987
Delete links to non-existing `run_plan_mpi.cc` (#136204)
malfet Sep 17, 2024
a838284
Support rms_norm() for NJT (#135872)
jbschlosser Sep 17, 2024
ea10c07
[export] Deserialize args with python keyword names (#136036)
angelayi Sep 17, 2024
a4e9a1c
[TorchRec][PT2 IR][APF] short circuit the flatten/unflatten between E…
TroyGarden Sep 17, 2024
e3aa5e2
[NCCL] Don't override `waitUntilInitialized`'s setting of `comm->init…
eqy Sep 17, 2024
48d18fb
[PyTorch CUDA Allocator] Allow reuse of non-split blocks with better …
banitag1 Sep 17, 2024
a575ce0
[PyTorch Pinned Allocator] Add support of background thread to proces…
banitag1 Sep 17, 2024
f6f1504
[MPS] Fix 5D+ reductions over negative dimentions (#136198)
malfet Sep 17, 2024
cccf500
[c10d] remove sleep from watchdogHandler (#135760)
c-p-i-o Sep 18, 2024
b18ba94
[AO][Inductor] Enable WOQ fusion pattern with permute (#135928)
leslie-fang-intel Sep 13, 2024
6682327
[BE] Make `NestedTensorTransformerFunctions.cu` compilable without wa…
malfet Sep 18, 2024
029026d
add ut
Chao1Han Sep 18, 2024
8895f69
[torch/numpy][numpy2.0 compat] Additional changes for tests to run u…
kiukchung Sep 18, 2024
9aa22ea
[CI] Make linux-aarch64 shards actually running different tests (#136…
malfet Sep 18, 2024
a0207c8
[dynamo] Fix support for classmethod(property(...)) (#134968)
jansel Sep 17, 2024
083c914
Reland D62220158 (#136213)
mengluy0125 Sep 18, 2024
b5be4d8
Fix ROCm skip decorator for test_ddp_tp and multiprocess UTs (#136161)
pragupta Sep 18, 2024
701ba52
[Inductor] Increase multiplier to 3 for Inductor AMP FP16 benchmark c…
jiayisunx Sep 13, 2024
c8d152c
Fix fast_expand recursion error (#136163)
isuruf Sep 16, 2024
6a6f5b2
Add _addmm_activation to lower precision cast policy on AutocastCPU (…
CaoE Sep 18, 2024
605f2d8
[PyTorch] Remove unnecessary include of c10/util/Exception.h in irang…
swolchok Sep 17, 2024
3efaa01
[c10d] Make test compatible for new pytest (#136158)
fduwjj Sep 17, 2024
bad6904
[ROCm] upgrade ROCm CI builds to py3.10 (#134108)
jataylo Sep 18, 2024
5a6ddbc
Extending the Pytorch vec backend for SVE (ARM) (#119571)
maajidkhann Sep 18, 2024
68a7246
[cuDNN][conv][A100] Bump tolerances for `vmap_autograd_grad` `conv2d`…
eqy Sep 18, 2024
aae68e2
Add wait counter for nccl abort (#136067)
atuljangra Sep 18, 2024
1a86d8a
Fix calling Add._from_args and Mul._from_args (#136143)
isuruf Sep 16, 2024
bc9597b
[Traceable FSDP2] Minor refactor to traceable FSDP2 unit tests (#136219)
yf225 Sep 18, 2024
f1ad680
[dynamo]Remove stream hardcoding in dynamo VariableBuilder (#131763)
siju-samuel Sep 18, 2024
b9a197d
[BE][MPS] Delete duplicated code in `View.mm` (#136295)
malfet Sep 18, 2024
068c80e
[BE][MPS] Fix deprecation warnings on MacOS 15.0 (#136292)
malfet Sep 18, 2024
f2b0fc8
Add uint16 support for observer (#136238)
jerryzh168 Sep 18, 2024
e037bb3
[dynamo] fix crash in InspectSignatureVariable (#136010)
williamwen42 Sep 17, 2024
7755176
Add type checks for Tensor.add_ (#135864)
DuyguA Sep 19, 2024
001dac2
use lintrunner format code
Chao1Han Sep 19, 2024
db80b98
XFAIL test_segfault (#136252)
huydhn Sep 19, 2024
f13b449
rm allgatherv align with nccl
Chao1Han Sep 19, 2024
156c2ac
update
Chao1Han Sep 19, 2024
908a568
Return unsafe_view instead of view from matmul when folding occurs (#…
jwieczorekhabana Sep 19, 2024
bce52d0
[CODEMOD][caffe2] use npt.NDArray instead of np.ndarray in type annot…
igorsugak Sep 19, 2024
4ea741d
Revert "Reland D62220158 (#136213)"
pytorchmergebot Sep 19, 2024
65df26f
[FSDP2] Fixed 2D mismatched grad placements (#136237)
awgu Sep 18, 2024
803ce50
Log structured logging overhead to dynamo compile (kinda) (#136142)
jamesjwu Sep 19, 2024
8d9c427
Type _sympy/functions.py [1/n] (#136205)
bobrenjc93 Sep 19, 2024
ccca3de
[ROCm] Enable Flex attention tests on AMD gpus (#136245)
jerrymannil Sep 19, 2024
49723a8
fix stride compare failed when size value equal to one in ForeachUtil…
Shan19900305 Sep 19, 2024
8cba0ec
[AOTI][Tooling][8/n] Add option to pinpoint kernel names in debug pri…
YUNQIUGUO Sep 19, 2024
b71802f
add basic_modules_ListOfLinears_inductor_gpu_force_shape_pad (#136175)
laithsakka Sep 17, 2024
7bbdf87
[22/N] Fix clang-tidy warnings in jit (#134829)
cyyever Sep 19, 2024
172ecf7
DTensor: dont hash symint tensor input in propagate_tensor_meta (#136…
bdhirsh Sep 18, 2024
9b424aa
[CI][CUSPARSELT] Extend cusparselt installation script to support cud…
nWEIdia Sep 19, 2024
79fd17e
Merge branch 'xccl' into xccl-group
Chao1Han Sep 20, 2024
bebf530
TCPStoreLibUvBackend: trace operations (#136320)
d4l3k Sep 20, 2024
1dfa07e
passing FileTimerRequests.to_json() to log_debug_info_for_expired_tim…
felixsu2006 Sep 20, 2024
d45b015
Add deterministic path for CUDA `cumsum` (#136224)
kurtamohler Sep 20, 2024
fe0e9fb
Fix flaky SIGSEGV crash in test_profile_memory (#136304)
huydhn Sep 20, 2024
652da01
Xccl process group for Pytorch
Chao1Han Aug 29, 2024
0cb0016
Merge remote-tracking branch 'upstream/main' into xccl-bak
Chao1Han Sep 20, 2024
a71d69a
Align latest
Chao1Han Sep 20, 2024
a1c2d6b
Merge branch 'xccl-bak' into xccl-group
Chao1Han Sep 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,8 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL"
Expand Down Expand Up @@ -353,6 +355,8 @@ cmake_dependent_option(USE_C10D_GLOO "USE C10D GLOO" ON
"USE_DISTRIBUTED;USE_GLOO" OFF)
cmake_dependent_option(USE_C10D_NCCL "USE C10D NCCL" ON
"USE_DISTRIBUTED;USE_NCCL" OFF)
cmake_dependent_option(USE_C10D_XCCL "USE C10D XCCL" ON
"USE_DISTRIBUTED;USE_XCCL" OFF)
cmake_dependent_option(USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI"
OFF)
cmake_dependent_option(
Expand All @@ -365,6 +369,8 @@ cmake_dependent_option(
USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
cmake_dependent_option(
USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF)
cmake_dependent_option(
USE_C10D_XCCL "USE C10D XCCL" ON "USE_DISTRIBUTED;USE_XCCL" OFF)
cmake_dependent_option(
USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF)
cmake_dependent_option(
Expand Down
4 changes: 4 additions & 0 deletions build_variables.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,10 @@ libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_s
"torch/csrc/cuda/nccl.cpp",
]

libtorch_xpu_distributed_extra_sources = [
"torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp",
]

torch_cpp_srcs = [
"torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA
"torch/csrc/api/src/data/datasets/mnist.cpp",
Expand Down
13 changes: 13 additions & 0 deletions caffe2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,9 @@ elseif(USE_CUDA)
endif()

if(USE_XPU)
if(USE_XCCL)
append_filelist("libtorch_xpu_distributed_extra_sources" Caffe2_XPU_SRCS)
endif()
add_library(torch_xpu ${Caffe2_XPU_SRCS})
torch_compile_options(torch_xpu) # see cmake/public/utils.cmake
target_compile_definitions(torch_xpu PRIVATE USE_XPU)
Expand Down Expand Up @@ -1079,6 +1082,10 @@ if(USE_XPU)
include_directories(SYSTEM ${ATen_XPU_INCLUDE_DIRS})

endif()
if(USE_XCCL)
target_link_libraries(torch_xpu PRIVATE torch::xccl)
target_compile_definitions(torch_xpu PRIVATE USE_XCCL)
endif()
endif()

if(NOT MSVC AND USE_XNNPACK)
Expand Down Expand Up @@ -1365,6 +1372,12 @@ if(USE_DISTRIBUTED)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
endif()
if(USE_C10D_XCCL)
target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp
PROPERTIES COMPILE_DEFINITIONS "CCL_ENABLE_ZE;CCL_ENABLE_SYCL")
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
Expand Down
1 change: 1 addition & 0 deletions caffe2/core/macros.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
{"USE_CUDNN", "${USE_CUDNN}"}, \
{"CUDNN_VERSION", "${CUDNN_VERSION}"}, \
{"USE_NCCL", "${USE_NCCL}"}, \
{"USE_XCCL", "${USE_XCCL}"}, \
{"USE_MPI", "${USE_MPI}"}, \
{"USE_GFLAGS", "${USE_GFLAGS}"}, \
{"USE_GLOG", "${USE_GLOG}"}, \
Expand Down
16 changes: 16 additions & 0 deletions cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1150,6 +1150,22 @@ if(USE_CUDA)
include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
endif()

# ---[ XCCL
if(USE_XCCL)
if(NOT USE_XPU)
message(WARNING
"Not using XPU, so disabling USE_XCCL. Suppress this warning with "
"-DUSE_XCCL=OFF.")
caffe2_update_option(USE_XCCL OFF)
elseif(NOT CMAKE_SYSTEM_NAME STREQUAL "Linux")
message(WARNING "USE_XCCL is currently only supported under Linux.")
caffe2_update_option(USE_XCCL OFF)
else()
include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake)
list(APPEND Caffe2_XPU_DEPENDENCY_LIBS torch::xccl)
endif()
endif()

if(USE_DISTRIBUTED AND USE_TENSORPIPE)
if(MSVC)
message(WARNING "Tensorpipe cannot be used on Windows.")
Expand Down
17 changes: 17 additions & 0 deletions cmake/External/xccl.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Include guard: this file may be pulled in from more than one place.
if(__XCCL_INCLUDED)
  return()
endif()
set(__XCCL_INCLUDED TRUE)

if(USE_XCCL)
  # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake.
  find_package(XCCL REQUIRED)
  if(XCCL_FOUND)
    # Expose XCCL to consumers as an imported interface target carrying
    # its include directories and link libraries as usage requirements.
    add_library(torch::xccl INTERFACE IMPORTED)
    set_target_properties(torch::xccl PROPERTIES
      INTERFACE_INCLUDE_DIRECTORIES "${XCCL_INCLUDE_DIR}"
      INTERFACE_LINK_LIBRARIES "${XCCL_LIBRARY}")
  endif()
endif()
68 changes: 68 additions & 0 deletions cmake/Modules/FindXCCL.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# This will define the following variables:
# XCCL_FOUND : True if the system has the XCCL library.
# XCCL_INCLUDE_DIR : Include directories needed to use XCCL.
# XCCL_LIBRARY_DIR :The path to the XCCL library.
# XCCL_LIBRARY : XCCL library fullname.

include(FindPackageHandleStandardArgs)

set(XCCL_ROOT "")
if(DEFINED ENV{CCL_ROOT})
set(XCCL_ROOT $ENV{CCL_ROOT})
endif()

string(COMPARE EQUAL "${XCCL_ROOT}" "" nosyclfound)
if(nosyclfound)
set(XCCL_FOUND False)
set(XCCL_REASON_FAILURE "XCCL library not set!!")
set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}")
return()
endif()

# Find include path from binary.
find_file(
XCCL_INCLUDE_DIR
NAMES include
HINTS ${XCCL_ROOT}
NO_DEFAULT_PATH
)

# Find include/oneapi path from include path.
find_file(
XCCL_INCLUDE_ONEAPI_DIR
NAMES oneapi
HINTS ${XCCL_ROOT}/include/
NO_DEFAULT_PATH
)

list(APPEND XCCL_INCLUDE_DIR ${XCCL_INCLUDE_ONEAPI_DIR})

# Find library directory from binary.
find_file(
XCCL_LIBRARY_DIR
NAMES lib
HINTS ${XCCL_ROOT}
NO_DEFAULT_PATH
)

# Find XCCL library fullname.
find_library(
XCCL_LIBRARY
NAMES ccl
HINTS ${XCCL_LIBRARY_DIR}
NO_DEFAULT_PATH
)

if((NOT XCCL_INCLUDE_DIR) OR (NOT XCCL_LIBRARY_DIR) OR (NOT XCCL_LIBRARY))
set(XCCL_FOUND False)
set(XCCL_REASON_FAILURE "XCCL library is incomplete!!")
set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}")
return()
endif()

find_package_handle_standard_args(
XCCL
FOUND_VAR XCCL_FOUND
REQUIRED_VARS XCCL_INCLUDE_DIR XCCL_LIBRARY_DIR XCCL_LIBRARY
REASON_FAILURE_MESSAGE "${XCCL_REASON_FAILURE}"
)
6 changes: 6 additions & 0 deletions cmake/Summary.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,12 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")
endif()
message(STATUS " USE_ITT : ${USE_ITT}")
message(STATUS " USE_XCCL : ${USE_XCCL}")
if(${USE_XCCL})
message(STATUS " USE_C10D_XCCL : ${USE_C10D_XCCL}")
message(STATUS " XCCL include path : ${XCCL_INCLUDE_DIR}")
message(STATUS " XCCL library : ${XCCL_LIBRARY}")
endif()
message(STATUS " USE_NCCL : ${USE_NCCL}")
if(${USE_NCCL})
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
Expand Down
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,10 @@ def run(self):
report("-- Building NCCL library")
else:
report("-- Not using NCCL")
if cmake_cache_vars["USE_XCCL"]:
report("-- Building XCCL library")
else:
report("-- Not using XCCL")
if cmake_cache_vars["USE_DISTRIBUTED"]:
if IS_WINDOWS:
report("-- Building without distributed package")
Expand Down
5 changes: 3 additions & 2 deletions test/distributed/test_c10d_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ def gpus_for_rank(world_size):
On a single node, all visible GPUs are evenly
divided to subsets, each process only uses a subset.
"""
visible_devices = list(range(torch.cuda.device_count()))
gpus_per_process = torch.cuda.device_count() // world_size
device_count = torch.xpu.device_count() if torch.xpu.is_available() else torch.cuda.device_count()
visible_devices = list(range(device_count))
gpus_per_process = device_count // world_size
gpus_for_rank = []
for rank in range(world_size):
gpus_for_rank.append(
Expand Down
Loading
Loading