From 34d876317a021df3095fa5a75382bc927ed81ce5 Mon Sep 17 00:00:00 2001 From: thecharlieblake <> Date: Thu, 7 Nov 2024 14:36:18 +0000 Subject: [PATCH] thecharlieblake published a site update --- _sources/index.rst.txt | 2 +- _sources/user_guide.rst.txt | 2 +- index.html | 2 +- objects.inv | Bin 8755 -> 8769 bytes searchindex.js | 2 +- user_guide.html | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) diff --git a/_sources/index.rst.txt b/_sources/index.rst.txt index de8eafe..13e9610 100644 --- a/_sources/index.rst.txt +++ b/_sources/index.rst.txt @@ -18,7 +18,7 @@ To install :code:`unit-scaling`, run: .. code-block:: - pip install git+https://github.com/graphcore-research/unit-scaling.git + pip install unit-scaling Getting Started --------------- diff --git a/_sources/user_guide.rst.txt b/_sources/user_guide.rst.txt index a8ec322..1a85eea 100644 --- a/_sources/user_guide.rst.txt +++ b/_sources/user_guide.rst.txt @@ -19,7 +19,7 @@ To install the :code:`unit-scaling` library, run: .. code-block:: - pip install git+https://github.com/graphcore-research/unit-scaling.git + pip install unit-scaling For those who wish to develop on the :code:`unit-scaling` codebase, clone or fork our `GitHub repo `_ and follow the diff --git a/index.html b/index.html index daafde4..9dec630 100644 --- a/index.html +++ b/index.html @@ -84,7 +84,7 @@

Unit Scaling

Installation

To install unit-scaling, run:

-
pip install git+https://github.com/graphcore-research/unit-scaling.git
+
pip install unit-scaling
 
diff --git a/objects.inv b/objects.inv index d51cfb1c389936e5681c905bd67d7ac9c66cdd9c..f8c4b730fb4438ec4b179891c3cf42ec7ec2ab56 100644 GIT binary patch delta 26 hcmdn&a?oXh6ThKeN@|&*c6yp=l4)XM%0}l8iU5Ft37h}` delta 12 TcmX@;ve{*V6Ql9Qzz>Q5B5?&4 diff --git a/searchindex.js b/searchindex.js index 99da586..8a22827 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"API reference": [[0, null]], "Almost-scaled dot-product attention": [[106, null]], "Contents": [[104, null]], "Development": [[104, "development"]], "Getting Started": [[104, "getting-started"]], "How to unit-scale a model": [[107, "how-to-unit-scale-a-model"]], "Installation": [[104, "installation"], [107, "installation"]], "Key considerations for unit scaling": [[107, "key-considerations-for-unit-scaling"]], "Limitations": [[105, null]], "Optimising unit-scaled models": [[107, "optimising-unit-scaled-models"]], "Unit Scaling": [[104, null]], "User guide": [[107, null]], "What is unit scaling?": [[107, "what-is-unit-scaling"]], "unit_scaling": [[1, null]], "unit_scaling.Conv1d": [[2, null]], "unit_scaling.CrossEntropyLoss": [[3, null]], "unit_scaling.DepthModuleList": [[4, null]], "unit_scaling.DepthSequential": [[5, null]], "unit_scaling.Dropout": [[6, null]], "unit_scaling.Embedding": [[7, null]], "unit_scaling.GELU": [[8, null]], "unit_scaling.LayerNorm": [[9, null]], "unit_scaling.Linear": [[10, null]], "unit_scaling.LinearReadout": [[11, null]], "unit_scaling.MHSA": [[12, null]], "unit_scaling.MLP": [[13, null]], "unit_scaling.Parameter": [[14, null]], "unit_scaling.RMSNorm": [[15, null]], "unit_scaling.SiLU": [[16, null]], "unit_scaling.Softmax": [[17, null]], "unit_scaling.TransformerDecoder": [[18, null]], "unit_scaling.TransformerLayer": [[19, null]], "unit_scaling.analysis": [[20, null]], "unit_scaling.analysis.example_batch": [[21, null]], "unit_scaling.analysis.graph_to_dataframe": [[22, null]], "unit_scaling.analysis.plot": [[23, null]], "unit_scaling.analysis.visualiser": [[24, null]], "unit_scaling.constraints": [[25, null]], "unit_scaling.constraints.amean": [[26, null]], "unit_scaling.constraints.apply_constraint": [[27, null]], "unit_scaling.constraints.gmean": [[28, null]], "unit_scaling.constraints.hmean": [[29, null]], "unit_scaling.constraints.to_grad_input_scale": [[30, null]], "unit_scaling.constraints.to_left_grad_scale": [[31, null]], "unit_scaling.constraints.to_output_scale": [[32, null]], "unit_scaling.constraints.to_right_grad_scale": [[33, null]], "unit_scaling.core": [[34, null]], "unit_scaling.core.functional": [[35, null]], "unit_scaling.core.functional.logarithmic_interpolation": [[36, null]], "unit_scaling.core.functional.rms": [[37, null]], "unit_scaling.core.functional.scale_elementwise": [[38, null]], "unit_scaling.core.functional.transformer_residual_scaling_rule": [[39, null]], "unit_scaling.formats": [[40, null]], "unit_scaling.formats.FPFormat": [[41, null]], "unit_scaling.formats.format_to_tuple": [[42, null]], "unit_scaling.formats.tuple_to_format": [[43, null]], "unit_scaling.functional": [[44, null]], "unit_scaling.functional.add": [[45, null]], "unit_scaling.functional.conv1d": [[46, null]], "unit_scaling.functional.cross_entropy": [[47, null]], "unit_scaling.functional.dropout": [[48, null]], "unit_scaling.functional.embedding": [[49, null]], "unit_scaling.functional.gelu": [[50, null]], "unit_scaling.functional.layer_norm": [[51, null]], "unit_scaling.functional.linear": [[52, null]], "unit_scaling.functional.linear_readout": [[53, null]], "unit_scaling.functional.matmul": [[54, null]], "unit_scaling.functional.mse_loss": [[55, null]], "unit_scaling.functional.residual_add": [[56, null]], "unit_scaling.functional.residual_apply": [[57, null]], "unit_scaling.functional.residual_split": [[58, null]], "unit_scaling.functional.rms_norm": [[59, null]], "unit_scaling.functional.scaled_dot_product_attention": [[60, null]], "unit_scaling.functional.silu": [[61, null]], "unit_scaling.functional.silu_glu": [[62, null]], "unit_scaling.functional.softmax": [[63, null]], "unit_scaling.optim": [[64, null]], "unit_scaling.optim.Adam": [[65, null]], "unit_scaling.optim.AdamW": [[66, null]], "unit_scaling.optim.SGD": [[67, null]], "unit_scaling.optim.lr_scale_for_depth": [[68, null]], "unit_scaling.optim.lr_scale_func_adam": [[69, null]], "unit_scaling.optim.lr_scale_func_sgd": [[70, null]], "unit_scaling.optim.scaled_parameters": [[71, null]], "unit_scaling.parameter": [[72, null]], "unit_scaling.parameter.OrderedDict": [[73, null]], "unit_scaling.parameter.Parameter": [[74, null]], "unit_scaling.parameter.ParameterData": [[75, null]], "unit_scaling.parameter.Protocol": [[76, null]], "unit_scaling.parameter.Tensor": [[77, null]], "unit_scaling.parameter.has_parameter_data": [[78, null]], "unit_scaling.scale": [[79, null]], "unit_scaling.scale.scale_bwd": [[80, null]], "unit_scaling.scale.scale_fwd": [[81, null]], "unit_scaling.transformer_residual_scaling_rule": [[82, null]], "unit_scaling.transforms": [[83, null]], "unit_scaling.transforms.Metrics": [[84, null]], "unit_scaling.transforms.compile": [[85, null]], "unit_scaling.transforms.prune_non_float_tensors": [[86, null]], "unit_scaling.transforms.prune_same_scale_tensors": [[87, null]], "unit_scaling.transforms.prune_selected_nodes": [[88, null]], "unit_scaling.transforms.simulate_format": [[89, null]], "unit_scaling.transforms.simulate_fp8": [[90, null]], "unit_scaling.transforms.track_scales": [[91, null]], "unit_scaling.transforms.unit_scale": [[92, null]], "unit_scaling.transforms.utils": [[93, null]], "unit_scaling.transforms.utils.apply_transform": [[94, null]], "unit_scaling.transforms.utils.patch_to_expand_modules": [[95, null]], "unit_scaling.transforms.utils.replace_node_with_function": [[96, null]], "unit_scaling.transforms.utils.torch_nn_modules_to_user_modules": [[97, null]], "unit_scaling.utils": [[98, null]], "unit_scaling.utils.ScalePair": [[99, null]], "unit_scaling.utils.ScaleTracker": [[100, null]], "unit_scaling.utils.ScaleTrackingInterpreter": [[101, null]], "unit_scaling.utils.analyse_module": [[102, null]], "unit_scaling.visualiser": [[103, null]]}, "docnames": ["api_reference", "generated/unit_scaling", "generated/unit_scaling.Conv1d", "generated/unit_scaling.CrossEntropyLoss", "generated/unit_scaling.DepthModuleList", "generated/unit_scaling.DepthSequential", "generated/unit_scaling.Dropout", "generated/unit_scaling.Embedding", "generated/unit_scaling.GELU", "generated/unit_scaling.LayerNorm", "generated/unit_scaling.Linear", "generated/unit_scaling.LinearReadout", "generated/unit_scaling.MHSA", "generated/unit_scaling.MLP", "generated/unit_scaling.Parameter", "generated/unit_scaling.RMSNorm", "generated/unit_scaling.SiLU", "generated/unit_scaling.Softmax", "generated/unit_scaling.TransformerDecoder", "generated/unit_scaling.TransformerLayer", "generated/unit_scaling.analysis", "generated/unit_scaling.analysis.example_batch", "generated/unit_scaling.analysis.graph_to_dataframe", "generated/unit_scaling.analysis.plot", "generated/unit_scaling.analysis.visualiser", "generated/unit_scaling.constraints", "generated/unit_scaling.constraints.amean", "generated/unit_scaling.constraints.apply_constraint", "generated/unit_scaling.constraints.gmean", "generated/unit_scaling.constraints.hmean", "generated/unit_scaling.constraints.to_grad_input_scale", "generated/unit_scaling.constraints.to_left_grad_scale", "generated/unit_scaling.constraints.to_output_scale", "generated/unit_scaling.constraints.to_right_grad_scale", "generated/unit_scaling.core", "generated/unit_scaling.core.functional", "generated/unit_scaling.core.functional.logarithmic_interpolation", "generated/unit_scaling.core.functional.rms", "generated/unit_scaling.core.functional.scale_elementwise", "generated/unit_scaling.core.functional.transformer_residual_scaling_rule", "generated/unit_scaling.formats", "generated/unit_scaling.formats.FPFormat", "generated/unit_scaling.formats.format_to_tuple", "generated/unit_scaling.formats.tuple_to_format", "generated/unit_scaling.functional", "generated/unit_scaling.functional.add", "generated/unit_scaling.functional.conv1d", "generated/unit_scaling.functional.cross_entropy", "generated/unit_scaling.functional.dropout", "generated/unit_scaling.functional.embedding", "generated/unit_scaling.functional.gelu", "generated/unit_scaling.functional.layer_norm", "generated/unit_scaling.functional.linear", "generated/unit_scaling.functional.linear_readout", "generated/unit_scaling.functional.matmul", "generated/unit_scaling.functional.mse_loss", "generated/unit_scaling.functional.residual_add", "generated/unit_scaling.functional.residual_apply", "generated/unit_scaling.functional.residual_split", "generated/unit_scaling.functional.rms_norm", "generated/unit_scaling.functional.scaled_dot_product_attention", "generated/unit_scaling.functional.silu", "generated/unit_scaling.functional.silu_glu", "generated/unit_scaling.functional.softmax", "generated/unit_scaling.optim", "generated/unit_scaling.optim.Adam", "generated/unit_scaling.optim.AdamW", "generated/unit_scaling.optim.SGD", "generated/unit_scaling.optim.lr_scale_for_depth", "generated/unit_scaling.optim.lr_scale_func_adam", "generated/unit_scaling.optim.lr_scale_func_sgd", "generated/unit_scaling.optim.scaled_parameters", "generated/unit_scaling.parameter", "generated/unit_scaling.parameter.OrderedDict", "generated/unit_scaling.parameter.Parameter", "generated/unit_scaling.parameter.ParameterData", "generated/unit_scaling.parameter.Protocol", "generated/unit_scaling.parameter.Tensor", "generated/unit_scaling.parameter.has_parameter_data", "generated/unit_scaling.scale", "generated/unit_scaling.scale.scale_bwd", "generated/unit_scaling.scale.scale_fwd", "generated/unit_scaling.transformer_residual_scaling_rule", "generated/unit_scaling.transforms", "generated/unit_scaling.transforms.Metrics", "generated/unit_scaling.transforms.compile", "generated/unit_scaling.transforms.prune_non_float_tensors", "generated/unit_scaling.transforms.prune_same_scale_tensors", "generated/unit_scaling.transforms.prune_selected_nodes", "generated/unit_scaling.transforms.simulate_format", "generated/unit_scaling.transforms.simulate_fp8", "generated/unit_scaling.transforms.track_scales", "generated/unit_scaling.transforms.unit_scale", "generated/unit_scaling.transforms.utils", "generated/unit_scaling.transforms.utils.apply_transform", "generated/unit_scaling.transforms.utils.patch_to_expand_modules", "generated/unit_scaling.transforms.utils.replace_node_with_function", "generated/unit_scaling.transforms.utils.torch_nn_modules_to_user_modules", "generated/unit_scaling.utils", "generated/unit_scaling.utils.ScalePair", "generated/unit_scaling.utils.ScaleTracker", "generated/unit_scaling.utils.ScaleTrackingInterpreter", "generated/unit_scaling.utils.analyse_module", "generated/unit_scaling.visualiser", "index", "limitations", "posts/almost_scaled_dot_product_attention", "user_guide"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["api_reference.rst", "generated/unit_scaling.rst", "generated/unit_scaling.Conv1d.rst", "generated/unit_scaling.CrossEntropyLoss.rst", "generated/unit_scaling.DepthModuleList.rst", "generated/unit_scaling.DepthSequential.rst", "generated/unit_scaling.Dropout.rst", "generated/unit_scaling.Embedding.rst", "generated/unit_scaling.GELU.rst", "generated/unit_scaling.LayerNorm.rst", "generated/unit_scaling.Linear.rst", "generated/unit_scaling.LinearReadout.rst", "generated/unit_scaling.MHSA.rst", "generated/unit_scaling.MLP.rst", "generated/unit_scaling.Parameter.rst", "generated/unit_scaling.RMSNorm.rst", "generated/unit_scaling.SiLU.rst", "generated/unit_scaling.Softmax.rst", "generated/unit_scaling.TransformerDecoder.rst", "generated/unit_scaling.TransformerLayer.rst", "generated/unit_scaling.analysis.rst", "generated/unit_scaling.analysis.example_batch.rst", "generated/unit_scaling.analysis.graph_to_dataframe.rst", "generated/unit_scaling.analysis.plot.rst", "generated/unit_scaling.analysis.visualiser.rst", "generated/unit_scaling.constraints.rst", "generated/unit_scaling.constraints.amean.rst", "generated/unit_scaling.constraints.apply_constraint.rst", "generated/unit_scaling.constraints.gmean.rst", "generated/unit_scaling.constraints.hmean.rst", "generated/unit_scaling.constraints.to_grad_input_scale.rst", "generated/unit_scaling.constraints.to_left_grad_scale.rst", "generated/unit_scaling.constraints.to_output_scale.rst", "generated/unit_scaling.constraints.to_right_grad_scale.rst", "generated/unit_scaling.core.rst", "generated/unit_scaling.core.functional.rst", "generated/unit_scaling.core.functional.logarithmic_interpolation.rst", "generated/unit_scaling.core.functional.rms.rst", "generated/unit_scaling.core.functional.scale_elementwise.rst", "generated/unit_scaling.core.functional.transformer_residual_scaling_rule.rst", "generated/unit_scaling.formats.rst", "generated/unit_scaling.formats.FPFormat.rst", "generated/unit_scaling.formats.format_to_tuple.rst", "generated/unit_scaling.formats.tuple_to_format.rst", "generated/unit_scaling.functional.rst", "generated/unit_scaling.functional.add.rst", "generated/unit_scaling.functional.conv1d.rst", "generated/unit_scaling.functional.cross_entropy.rst", "generated/unit_scaling.functional.dropout.rst", "generated/unit_scaling.functional.embedding.rst", "generated/unit_scaling.functional.gelu.rst", "generated/unit_scaling.functional.layer_norm.rst", "generated/unit_scaling.functional.linear.rst", "generated/unit_scaling.functional.linear_readout.rst", "generated/unit_scaling.functional.matmul.rst", "generated/unit_scaling.functional.mse_loss.rst", "generated/unit_scaling.functional.residual_add.rst", "generated/unit_scaling.functional.residual_apply.rst", "generated/unit_scaling.functional.residual_split.rst", "generated/unit_scaling.functional.rms_norm.rst", "generated/unit_scaling.functional.scaled_dot_product_attention.rst", "generated/unit_scaling.functional.silu.rst", "generated/unit_scaling.functional.silu_glu.rst", "generated/unit_scaling.functional.softmax.rst", "generated/unit_scaling.optim.rst", "generated/unit_scaling.optim.Adam.rst", "generated/unit_scaling.optim.AdamW.rst", "generated/unit_scaling.optim.SGD.rst", "generated/unit_scaling.optim.lr_scale_for_depth.rst", "generated/unit_scaling.optim.lr_scale_func_adam.rst", "generated/unit_scaling.optim.lr_scale_func_sgd.rst", "generated/unit_scaling.optim.scaled_parameters.rst", "generated/unit_scaling.parameter.rst", "generated/unit_scaling.parameter.OrderedDict.rst", "generated/unit_scaling.parameter.Parameter.rst", "generated/unit_scaling.parameter.ParameterData.rst", "generated/unit_scaling.parameter.Protocol.rst", "generated/unit_scaling.parameter.Tensor.rst", "generated/unit_scaling.parameter.has_parameter_data.rst", "generated/unit_scaling.scale.rst", "generated/unit_scaling.scale.scale_bwd.rst", "generated/unit_scaling.scale.scale_fwd.rst", "generated/unit_scaling.transformer_residual_scaling_rule.rst", "generated/unit_scaling.transforms.rst", "generated/unit_scaling.transforms.Metrics.rst", "generated/unit_scaling.transforms.compile.rst", "generated/unit_scaling.transforms.prune_non_float_tensors.rst", "generated/unit_scaling.transforms.prune_same_scale_tensors.rst", "generated/unit_scaling.transforms.prune_selected_nodes.rst", "generated/unit_scaling.transforms.simulate_format.rst", "generated/unit_scaling.transforms.simulate_fp8.rst", "generated/unit_scaling.transforms.track_scales.rst", "generated/unit_scaling.transforms.unit_scale.rst", "generated/unit_scaling.transforms.utils.rst", "generated/unit_scaling.transforms.utils.apply_transform.rst", "generated/unit_scaling.transforms.utils.patch_to_expand_modules.rst", "generated/unit_scaling.transforms.utils.replace_node_with_function.rst", "generated/unit_scaling.transforms.utils.torch_nn_modules_to_user_modules.rst", "generated/unit_scaling.utils.rst", "generated/unit_scaling.utils.ScalePair.rst", "generated/unit_scaling.utils.ScaleTracker.rst", "generated/unit_scaling.utils.ScaleTrackingInterpreter.rst", "generated/unit_scaling.utils.analyse_module.rst", "generated/unit_scaling.visualiser.rst", "index.rst", "limitations.rst", "posts/almost_scaled_dot_product_attention.md", "user_guide.rst"], "indexentries": {"abs() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.abs", false]], "abs_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.abs_", false]], "absolute() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.absolute", false]], "absolute_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.absolute_", false]], "acos() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.acos", false]], "acos_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.acos_", false]], "acosh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.acosh", false]], "acosh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.acosh_", false]], "adam (class in unit_scaling.optim)": [[65, "unit_scaling.optim.Adam", false]], "adamw (class in unit_scaling.optim)": [[66, "unit_scaling.optim.AdamW", false]], "add() (in module unit_scaling.functional)": [[45, "unit_scaling.functional.add", false]], "add() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.add", false]], "add_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.add_", false]], "add_param_group() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.add_param_group", false]], "add_param_group() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.add_param_group", false]], "add_param_group() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.add_param_group", false]], "addbmm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addbmm", false]], "addbmm_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addbmm_", false]], "addcdiv() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addcdiv", false]], "addcdiv_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addcdiv_", false]], "addcmul() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addcmul", false]], "addcmul_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addcmul_", false]], "addmm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addmm", false]], "addmm_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addmm_", false]], "addmv() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addmv", false]], "addmv_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addmv_", false]], "addr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addr", false]], "addr_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addr_", false]], "adjoint() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.adjoint", false]], "align_as() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.align_as", false]], "align_to() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.align_to", false]], "all() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.all", false]], "allclose() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.allclose", false]], "amax() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.amax", false]], "amean() (in module unit_scaling.constraints)": [[26, "unit_scaling.constraints.amean", false]], "amin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.amin", false]], "aminmax() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.aminmax", false]], "analyse_module() (in module unit_scaling.utils)": [[102, "unit_scaling.utils.analyse_module", false]], "angle() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.angle", false]], "any() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.any", false]], "append() (unit_scaling.depthmodulelist method)": [[4, "unit_scaling.DepthModuleList.append", false]], "append() (unit_scaling.depthsequential method)": [[5, "unit_scaling.DepthSequential.append", false]], "append() (unit_scaling.transformerdecoder method)": [[18, "unit_scaling.TransformerDecoder.append", false]], "apply_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.apply_", false]], "apply_constraint() (in module unit_scaling.constraints)": [[27, "unit_scaling.constraints.apply_constraint", false]], "apply_transform() (in module unit_scaling.transforms.utils)": [[94, "unit_scaling.transforms.utils.apply_transform", false]], "arccos() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arccos", false]], "arccos_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arccos_", false]], "arccosh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arccosh", false]], "arccosh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arccosh_", false]], "arcsin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arcsin", false]], "arcsin_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arcsin_", false]], "arcsinh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arcsinh", false]], "arcsinh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arcsinh_", false]], "arctan() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arctan", false]], "arctan2() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arctan2", false]], "arctan2_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arctan2_", false]], "arctan_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arctan_", false]], "arctanh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arctanh", false]], "arctanh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arctanh_", false]], "argmax() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.argmax", false]], "argmin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.argmin", false]], "argsort() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.argsort", false]], "argwhere() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.argwhere", false]], "as_strided() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.as_strided", false]], "as_strided_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.as_strided_", false]], "as_strided_scatter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.as_strided_scatter", false]], "as_subclass() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.as_subclass", false]], "asin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.asin", false]], "asin_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.asin_", false]], "asinh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.asinh", false]], "asinh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.asinh_", false]], "atan() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.atan", false]], "atan2() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.atan2", false]], "atan2_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.atan2_", false]], "atan_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.atan_", false]], "atanh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.atanh", false]], "atanh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.atanh_", false]], "backward() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.backward", false]], "backward() (unit_scaling.utils.scaletracker static method)": [[100, "unit_scaling.utils.ScaleTracker.backward", false]], "baddbmm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.baddbmm", false]], "baddbmm_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.baddbmm_", false]], "bernoulli() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bernoulli", false]], "bernoulli_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bernoulli_", false]], "bfloat16() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bfloat16", false]], "bias (unit_scaling.conv1d attribute)": [[2, "unit_scaling.Conv1d.bias", false]], "bias (unit_scaling.layernorm attribute)": [[9, "unit_scaling.LayerNorm.bias", false]], "bias (unit_scaling.linear attribute)": [[10, "unit_scaling.Linear.bias", false]], "bias (unit_scaling.linearreadout attribute)": [[11, "unit_scaling.LinearReadout.bias", false]], "bincount() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bincount", false]], "bits (unit_scaling.formats.fpformat property)": [[41, "unit_scaling.formats.FPFormat.bits", false]], "bitwise_and() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_and", false]], "bitwise_and_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_and_", false]], "bitwise_left_shift() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_left_shift", false]], "bitwise_left_shift_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_left_shift_", false]], "bitwise_not() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_not", false]], "bitwise_not_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_not_", false]], "bitwise_or() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_or", false]], "bitwise_or_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_or_", false]], "bitwise_right_shift() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_right_shift", false]], "bitwise_right_shift_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_right_shift_", false]], "bitwise_xor() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_xor", false]], "bitwise_xor_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_xor_", false]], "bmm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bmm", false]], "bool() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bool", false]], "boxed_run() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.boxed_run", false]], "broadcast_to() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.broadcast_to", false]], "byte() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.byte", false]], "call_function() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.call_function", false]], "call_method() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.call_method", false]], "call_module() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.call_module", false]], "cauchy_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cauchy_", false]], "cdouble() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cdouble", false]], "ceil() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ceil", false]], "ceil_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ceil_", false]], "cfloat() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cfloat", false]], "chalf() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.chalf", false]], "char() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.char", false]], "cholesky() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cholesky", false]], "cholesky_inverse() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cholesky_inverse", false]], "cholesky_solve() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cholesky_solve", false]], "chunk() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.chunk", false]], "clamp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.clamp", false]], "clamp_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.clamp_", false]], "clear() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.clear", false]], "clip() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.clip", false]], "clip_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.clip_", false]], "clone() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.clone", false]], "coalesce() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.coalesce", false]], "col_indices() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.col_indices", false]], "compile() (in module unit_scaling.transforms)": [[85, "unit_scaling.transforms.compile", false]], "conj() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.conj", false]], "conj_physical() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.conj_physical", false]], "conj_physical_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.conj_physical_", false]], "contiguous() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.contiguous", false]], "conv1d (class in unit_scaling)": [[2, "unit_scaling.Conv1d", false]], "conv1d() (in module unit_scaling.functional)": [[46, "unit_scaling.functional.conv1d", false]], "copy() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.copy", false]], "copy_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.copy_", false]], "copysign() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.copysign", false]], "copysign_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.copysign_", false]], "corrcoef() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.corrcoef", false]], "cos() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cos", false]], "cos_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cos_", false]], "cosh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cosh", false]], "cosh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cosh_", false]], "count_nonzero() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.count_nonzero", false]], "cov() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cov", false]], "cpu() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cpu", false]], "cross() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cross", false]], "cross_entropy() (in module unit_scaling.functional)": [[47, "unit_scaling.functional.cross_entropy", false]], "crossentropyloss (class in unit_scaling)": [[3, "unit_scaling.CrossEntropyLoss", false]], "crow_indices() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.crow_indices", false]], "cuda() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cuda", false]], "cummax() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cummax", false]], "cummin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cummin", false]], "cumprod() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cumprod", false]], "cumprod_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cumprod_", false]], "cumsum() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cumsum", false]], "cumsum_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cumsum_", false]], "data_ptr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.data_ptr", false]], "deg2rad() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.deg2rad", false]], "deg2rad_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.deg2rad_", false]], "dense_dim() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dense_dim", false]], "depthmodulelist (class in unit_scaling)": [[4, "unit_scaling.DepthModuleList", false]], "depthsequential (class in unit_scaling)": [[5, "unit_scaling.DepthSequential", false]], "dequantize() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dequantize", false]], "det() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.det", false]], "detach() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.detach", false]], "detach_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.detach_", false]], "device (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.device", false]], "diag() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.diag", false]], "diag_embed() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.diag_embed", false]], "diagflat() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.diagflat", false]], "diagonal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.diagonal", false]], "diagonal_scatter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.diagonal_scatter", false]], "diff() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.diff", false]], "digamma() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.digamma", false]], "digamma_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.digamma_", false]], "dim() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dim", false]], "dim_order() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dim_order", false]], "dist() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dist", false]], "div() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.div", false]], "div_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.div_", false]], "divide() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.divide", false]], "divide_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.divide_", false]], "dot() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dot", false]], "double() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.double", false]], "dropout (class in unit_scaling)": [[6, "unit_scaling.Dropout", false]], "dropout() (in module unit_scaling.functional)": [[48, "unit_scaling.functional.dropout", false]], "dsplit() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dsplit", false]], "element_size() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.element_size", false]], "embedding (class in unit_scaling)": [[7, "unit_scaling.Embedding", false]], "embedding() (in module unit_scaling.functional)": [[49, "unit_scaling.functional.embedding", false]], "eq() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.eq", false]], "eq_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.eq_", false]], "equal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.equal", false]], "erf() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.erf", false]], "erf_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.erf_", false]], "erfc() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.erfc", false]], "erfc_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.erfc_", false]], "erfinv() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.erfinv", false]], "erfinv_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.erfinv_", false]], "example_batch() (in module unit_scaling.analysis)": [[21, "unit_scaling.analysis.example_batch", false]], "exp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.exp", false]], "exp2() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.exp2", false]], "exp2_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.exp2_", false]], "exp_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.exp_", false]], "expand() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.expand", false]], "expand_as() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.expand_as", false]], "expm1() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.expm1", false]], "expm1_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.expm1_", false]], "exponential_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.exponential_", false]], "extend() (unit_scaling.depthmodulelist method)": [[4, "unit_scaling.DepthModuleList.extend", false]], "fetch_args_kwargs_from_env() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.fetch_args_kwargs_from_env", false]], "fetch_attr() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.fetch_attr", false]], "fill_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fill_", false]], "fill_diagonal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fill_diagonal_", false]], "fix() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fix", false]], "fix_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fix_", false]], "flatten() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.flatten", false]], "flip() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.flip", false]], "fliplr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fliplr", false]], "flipud() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.flipud", false]], "float() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.float", false]], "float_power() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.float_power", false]], "float_power_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.float_power_", false]], "floor() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.floor", false]], "floor_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.floor_", false]], "floor_divide() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.floor_divide", false]], "floor_divide_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.floor_divide_", false]], "fmax() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fmax", false]], "fmin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fmin", false]], "fmod() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fmod", false]], "fmod_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fmod_", false]], "format_to_tuple() (in module unit_scaling.formats)": [[42, "unit_scaling.formats.format_to_tuple", false]], "fpformat (class in unit_scaling.formats)": [[41, "unit_scaling.formats.FPFormat", false]], "frac() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.frac", false]], "frac_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.frac_", false]], "frexp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.frexp", false]], "from_pretrained() (unit_scaling.embedding class method)": [[7, "unit_scaling.Embedding.from_pretrained", false]], "fromkeys() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.fromkeys", false]], "gather() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.gather", false]], "gcd() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.gcd", false]], "gcd_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.gcd_", false]], "ge() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ge", false]], "ge_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ge_", false]], "gelu (class in unit_scaling)": [[8, "unit_scaling.GELU", false]], "gelu() (in module unit_scaling.functional)": [[50, "unit_scaling.functional.gelu", false]], "geometric_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.geometric_", false]], "geqrf() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.geqrf", false]], "ger() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ger", false]], "get() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.get", false]], "get_attr() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.get_attr", false]], "get_device() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.get_device", false]], "gmean() (in module unit_scaling.constraints)": [[28, "unit_scaling.constraints.gmean", false]], "grad (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.grad", false]], "graph_to_dataframe() (in module unit_scaling.analysis)": [[22, "unit_scaling.analysis.graph_to_dataframe", false]], "greater() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.greater", false]], "greater_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.greater_", false]], "greater_equal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.greater_equal", false]], "greater_equal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.greater_equal_", false]], "gt() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.gt", false]], "gt_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.gt_", false]], "h (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.H", false]], "half() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.half", false]], "hardshrink() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.hardshrink", false]], "has_names() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.has_names", false]], "has_parameter_data() (in module unit_scaling.parameter)": [[78, "unit_scaling.parameter.has_parameter_data", false]], "heaviside() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.heaviside", false]], "heaviside_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.heaviside_", false]], "histc() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.histc", false]], "histogram() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.histogram", false]], "hmean() (in module unit_scaling.constraints)": [[29, "unit_scaling.constraints.hmean", false]], "hsplit() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.hsplit", false]], "hypot() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.hypot", false]], "hypot_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.hypot_", false]], "i0() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.i0", false]], "i0_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.i0_", false]], "igamma() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.igamma", false]], "igamma_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.igamma_", false]], "igammac() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.igammac", false]], "igammac_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.igammac_", false]], "imag (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.imag", false]], "index_add() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_add", false]], "index_add_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_add_", false]], "index_copy() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_copy", false]], "index_copy_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_copy_", false]], "index_fill() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_fill", false]], "index_fill_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_fill_", false]], "index_put() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_put", false]], "index_put_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_put_", false]], "index_reduce_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_reduce_", false]], "index_select() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_select", false]], "indices() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.indices", false]], "inner() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.inner", false]], "insert() (unit_scaling.depthmodulelist method)": [[4, "unit_scaling.DepthModuleList.insert", false]], "int() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.int", false]], "int_repr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.int_repr", false]], "inverse() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.inverse", false]], "ipu() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ipu", false]], "is_coalesced() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_coalesced", false]], "is_complex() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_complex", false]], "is_conj() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_conj", false]], "is_contiguous() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_contiguous", false]], "is_cpu (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_cpu", false]], "is_cuda (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_cuda", false]], "is_floating_point() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_floating_point", false]], "is_inference() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_inference", false]], "is_ipu (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_ipu", false]], "is_leaf (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_leaf", false]], "is_meta (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_meta", false]], "is_mps (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_mps", false]], "is_neg() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_neg", false]], "is_pinned() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_pinned", false]], "is_quantized (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_quantized", false]], "is_set_to() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_set_to", false]], "is_shared() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_shared", false]], "is_signed() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_signed", false]], "is_sparse (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_sparse", false]], "is_sparse_csr (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_sparse_csr", false]], "is_xla (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_xla", false]], "is_xpu (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_xpu", false]], "isclose() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isclose", false]], "isfinite() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isfinite", false]], "isinf() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isinf", false]], "isnan() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isnan", false]], "isneginf() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isneginf", false]], "isposinf() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isposinf", false]], "isreal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isreal", false]], "istft() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.istft", false]], "item() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.item", false]], "items() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.items", false]], "itemsize (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.itemsize", false]], "jvp() (unit_scaling.utils.scaletracker static method)": [[100, "unit_scaling.utils.ScaleTracker.jvp", false]], "keys() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.keys", false]], "kron() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.kron", false]], "kthvalue() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.kthvalue", false]], "layer_norm() (in module unit_scaling.functional)": [[51, "unit_scaling.functional.layer_norm", false]], "layernorm (class in unit_scaling)": [[9, "unit_scaling.LayerNorm", false]], "lcm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lcm", false]], "lcm_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lcm_", false]], "ldexp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ldexp", false]], "ldexp_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ldexp_", false]], "le() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.le", false]], "le_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.le_", false]], "lerp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lerp", false]], "lerp_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lerp_", false]], "less() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.less", false]], "less_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.less_", false]], "less_equal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.less_equal", false]], "less_equal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.less_equal_", false]], "lgamma() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lgamma", false]], "lgamma_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lgamma_", false]], "linear (class in unit_scaling)": [[10, "unit_scaling.Linear", false]], "linear() (in module unit_scaling.functional)": [[52, "unit_scaling.functional.linear", false]], "linear_readout() (in module unit_scaling.functional)": [[53, "unit_scaling.functional.linear_readout", false]], "linearreadout (class in unit_scaling)": [[11, "unit_scaling.LinearReadout", false]], "load_state_dict() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.load_state_dict", false]], "load_state_dict() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.load_state_dict", false]], "load_state_dict() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.load_state_dict", false]], "log() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log", false]], "log10() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log10", false]], "log10_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log10_", false]], "log1p() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log1p", false]], "log1p_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log1p_", false]], "log2() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log2", false]], "log2_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log2_", false]], "log_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log_", false]], "log_normal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log_normal_", false]], "logaddexp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logaddexp", false]], "logaddexp2() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logaddexp2", false]], "logarithmic_interpolation() (in module unit_scaling.core.functional)": [[36, "unit_scaling.core.functional.logarithmic_interpolation", false]], "logcumsumexp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logcumsumexp", false]], "logdet() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logdet", false]], "logical_and() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_and", false]], "logical_and_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_and_", false]], "logical_not() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_not", false]], "logical_not_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_not_", false]], "logical_or() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_or", false]], "logical_or_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_or_", false]], "logical_xor() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_xor", false]], "logical_xor_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_xor_", false]], "logit() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logit", false]], "logit_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logit_", false]], "logsumexp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logsumexp", false]], "long() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.long", false]], "lr_scale_for_depth() (in module unit_scaling.optim)": [[68, "unit_scaling.optim.lr_scale_for_depth", false]], "lr_scale_func_adam() (in module unit_scaling.optim)": [[69, "unit_scaling.optim.lr_scale_func_adam", false]], "lr_scale_func_sgd() (in module unit_scaling.optim)": [[70, "unit_scaling.optim.lr_scale_func_sgd", false]], "lt() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lt", false]], "lt_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lt_", false]], "lu() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lu", false]], "lu_solve() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lu_solve", false]], "map_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.map_", false]], "map_nodes_to_values() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.map_nodes_to_values", false]], "mark_dirty() (unit_scaling.utils.scaletracker method)": [[100, "unit_scaling.utils.ScaleTracker.mark_dirty", false]], "mark_non_differentiable() (unit_scaling.utils.scaletracker method)": [[100, "unit_scaling.utils.ScaleTracker.mark_non_differentiable", false]], "masked_fill() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.masked_fill", false]], "masked_fill_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.masked_fill_", false]], "masked_scatter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.masked_scatter", false]], "masked_scatter_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.masked_scatter_", false]], "masked_select() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.masked_select", false]], "matmul() (in module unit_scaling.functional)": [[54, "unit_scaling.functional.matmul", false]], "matmul() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.matmul", false]], "matrix_exp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.matrix_exp", false]], "matrix_power() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.matrix_power", false]], "max() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.max", false]], "max_absolute_value (unit_scaling.formats.fpformat property)": [[41, "unit_scaling.formats.FPFormat.max_absolute_value", false]], "maximum() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.maximum", false]], "mean() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mean", false]], "median() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.median", false]], "metrics (class in unit_scaling.transforms)": [[84, "unit_scaling.transforms.Metrics", false]], "metrics.data (class in unit_scaling.transforms)": [[84, "unit_scaling.transforms.Metrics.Data", false]], "mh (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.mH", false]], "mhsa (class in unit_scaling)": [[12, "unit_scaling.MHSA", false]], "min() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.min", false]], "min_absolute_normal (unit_scaling.formats.fpformat property)": [[41, "unit_scaling.formats.FPFormat.min_absolute_normal", false]], "min_absolute_subnormal (unit_scaling.formats.fpformat property)": [[41, "unit_scaling.formats.FPFormat.min_absolute_subnormal", false]], "minimum() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.minimum", false]], "mlp (class in unit_scaling)": [[13, "unit_scaling.MLP", false]], "mm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mm", false]], "mode() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mode", false]], "module": [[1, "module-unit_scaling", false], [20, "module-unit_scaling.analysis", false], [25, "module-unit_scaling.constraints", false], [34, "module-unit_scaling.core", false], [35, "module-unit_scaling.core.functional", false], [40, "module-unit_scaling.formats", false], [44, "module-unit_scaling.functional", false], [64, "module-unit_scaling.optim", false], [72, "module-unit_scaling.parameter", false], [79, "module-unit_scaling.scale", false], [83, "module-unit_scaling.transforms", false], [93, "module-unit_scaling.transforms.utils", false], [98, "module-unit_scaling.utils", false]], "module_load() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.module_load", false]], "move_to_end() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.move_to_end", false]], "moveaxis() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.moveaxis", false]], "movedim() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.movedim", false]], "mse_loss() (in module unit_scaling.functional)": [[55, "unit_scaling.functional.mse_loss", false]], "msort() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.msort", false]], "mt (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.mT", false]], "mtia() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mtia", false]], "mul() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mul", false]], "mul_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mul_", false]], "multinomial() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.multinomial", false]], "multiply() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.multiply", false]], "multiply_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.multiply_", false]], "mv() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mv", false]], "mvlgamma() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mvlgamma", false]], "mvlgamma_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mvlgamma_", false]], "names (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.names", false]], "nan_to_num() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nan_to_num", false]], "nan_to_num_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nan_to_num_", false]], "nanmean() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nanmean", false]], "nanmedian() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nanmedian", false]], "nanquantile() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nanquantile", false]], "nansum() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nansum", false]], "narrow() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.narrow", false]], "narrow_copy() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.narrow_copy", false]], "nbytes (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.nbytes", false]], "ndim (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.ndim", false]], "ndimension() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ndimension", false]], "ne() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ne", false]], "ne_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ne_", false]], "neg() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.neg", false]], "neg_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.neg_", false]], "negative() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.negative", false]], "negative_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.negative_", false]], "nelement() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nelement", false]], "new_empty() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.new_empty", false]], "new_empty_strided() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.new_empty_strided", false]], "new_full() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.new_full", false]], "new_ones() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.new_ones", false]], "new_tensor() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.new_tensor", false]], "new_zeros() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.new_zeros", false]], "nextafter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nextafter", false]], "nextafter_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nextafter_", false]], "nonzero() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nonzero", false]], "nonzero_static() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nonzero_static", false]], "norm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.norm", false]], "normal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.normal_", false]], "not_equal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.not_equal", false]], "not_equal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.not_equal_", false]], "numel() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.numel", false]], "numpy() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.numpy", false]], "ordereddict (class in unit_scaling.parameter)": [[73, "unit_scaling.parameter.OrderedDict", false]], "orgqr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.orgqr", false]], "ormqr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ormqr", false]], "outer() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.outer", false]], "output() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.output", false]], "parameter() (in module unit_scaling)": [[14, "unit_scaling.Parameter", false]], "parameter() (in module unit_scaling.parameter)": [[74, "unit_scaling.parameter.Parameter", false]], "parameterdata (class in unit_scaling.parameter)": [[75, "unit_scaling.parameter.ParameterData", false]], "patch_to_expand_modules() (in module unit_scaling.transforms.utils)": [[95, "unit_scaling.transforms.utils.patch_to_expand_modules", false]], "permute() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.permute", false]], "pin_memory() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.pin_memory", false]], "pinverse() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.pinverse", false]], "placeholder() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.placeholder", false]], "plot() (in module unit_scaling.analysis)": [[23, "unit_scaling.analysis.plot", false]], "polygamma() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.polygamma", false]], "polygamma_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.polygamma_", false]], "pop() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.pop", false]], "popitem() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.popitem", false]], "positive() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.positive", false]], "pow() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.pow", false]], "pow_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.pow_", false]], "prod() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.prod", false]], "protocol (class in unit_scaling.parameter)": [[76, "unit_scaling.parameter.Protocol", false]], "prune_non_float_tensors() (in module unit_scaling.transforms)": [[86, "unit_scaling.transforms.prune_non_float_tensors", false]], "prune_same_scale_tensors() (in module unit_scaling.transforms)": [[87, "unit_scaling.transforms.prune_same_scale_tensors", false]], "prune_selected_nodes() (in module unit_scaling.transforms)": [[88, "unit_scaling.transforms.prune_selected_nodes", false]], "put() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.put", false]], "put_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.put_", false]], "q_per_channel_axis() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.q_per_channel_axis", false]], "q_per_channel_scales() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.q_per_channel_scales", false]], "q_per_channel_zero_points() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.q_per_channel_zero_points", false]], "q_scale() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.q_scale", false]], "q_zero_point() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.q_zero_point", false]], "qr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.qr", false]], "qscheme() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.qscheme", false]], "quantile() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.quantile", false]], "quantise() (unit_scaling.formats.fpformat method)": [[41, "unit_scaling.formats.FPFormat.quantise", false]], "quantise_bwd() (unit_scaling.formats.fpformat method)": [[41, "unit_scaling.formats.FPFormat.quantise_bwd", false]], "quantise_fwd() (unit_scaling.formats.fpformat method)": [[41, "unit_scaling.formats.FPFormat.quantise_fwd", false]], "rad2deg() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rad2deg", false]], "rad2deg_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rad2deg_", false]], "random_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.random_", false]], "ravel() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ravel", false]], "real (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.real", false]], "reciprocal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.reciprocal", false]], "reciprocal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.reciprocal_", false]], "record_stream() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.record_stream", false]], "refine_names() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.refine_names", false]], "register_hook() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.register_hook", false]], "register_load_state_dict_post_hook() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.register_load_state_dict_post_hook", false]], "register_load_state_dict_post_hook() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.register_load_state_dict_post_hook", false]], "register_load_state_dict_post_hook() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.register_load_state_dict_post_hook", false]], "register_load_state_dict_pre_hook() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.register_load_state_dict_pre_hook", false]], "register_load_state_dict_pre_hook() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.register_load_state_dict_pre_hook", false]], "register_load_state_dict_pre_hook() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.register_load_state_dict_pre_hook", false]], "register_post_accumulate_grad_hook() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.register_post_accumulate_grad_hook", false]], "register_state_dict_post_hook() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.register_state_dict_post_hook", false]], "register_state_dict_post_hook() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.register_state_dict_post_hook", false]], "register_state_dict_post_hook() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.register_state_dict_post_hook", false]], "register_state_dict_pre_hook() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.register_state_dict_pre_hook", false]], "register_state_dict_pre_hook() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.register_state_dict_pre_hook", false]], "register_state_dict_pre_hook() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.register_state_dict_pre_hook", false]], "register_step_post_hook() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.register_step_post_hook", false]], "register_step_post_hook() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.register_step_post_hook", false]], "register_step_post_hook() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.register_step_post_hook", false]], "register_step_pre_hook() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.register_step_pre_hook", false]], "register_step_pre_hook() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.register_step_pre_hook", false]], "register_step_pre_hook() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.register_step_pre_hook", false]], "remainder() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.remainder", false]], "remainder_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.remainder_", false]], "rename() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rename", false]], "rename_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rename_", false]], "renorm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.renorm", false]], "renorm_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.renorm_", false]], "repeat() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.repeat", false]], "repeat_interleave() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.repeat_interleave", false]], "replace_node_with_function() (in module unit_scaling.transforms.utils)": [[96, "unit_scaling.transforms.utils.replace_node_with_function", false]], "requires_grad (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.requires_grad", false]], "requires_grad_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.requires_grad_", false]], "reshape() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.reshape", false]], "reshape_as() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.reshape_as", false]], "residual_add() (in module unit_scaling.functional)": [[56, "unit_scaling.functional.residual_add", false]], "residual_apply() (in module unit_scaling.functional)": [[57, "unit_scaling.functional.residual_apply", false]], "residual_split() (in module unit_scaling.functional)": [[58, "unit_scaling.functional.residual_split", false]], "resize_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.resize_", false]], "resize_as_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.resize_as_", false]], "resolve_conj() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.resolve_conj", false]], "resolve_neg() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.resolve_neg", false]], "retain_grad() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.retain_grad", false]], "retains_grad (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.retains_grad", false]], "rms() (in module unit_scaling.core.functional)": [[37, "unit_scaling.core.functional.rms", false]], "rms_norm() (in module unit_scaling.functional)": [[59, "unit_scaling.functional.rms_norm", false]], "rmsnorm (class in unit_scaling)": [[15, "unit_scaling.RMSNorm", false]], "roll() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.roll", false]], "rot90() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rot90", false]], "round() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.round", false]], "round_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.round_", false]], "rsqrt() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rsqrt", false]], "rsqrt_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rsqrt_", false]], "run() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.run", false]], "run_node() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.run_node", false]], "save_for_backward() (unit_scaling.utils.scaletracker method)": [[100, "unit_scaling.utils.ScaleTracker.save_for_backward", false]], "save_for_forward() (unit_scaling.utils.scaletracker method)": [[100, "unit_scaling.utils.ScaleTracker.save_for_forward", false]], "scale_bwd() (in module unit_scaling.scale)": [[80, "unit_scaling.scale.scale_bwd", false]], "scale_elementwise() (in module unit_scaling.core.functional)": [[38, "unit_scaling.core.functional.scale_elementwise", false]], "scale_fwd() (in module unit_scaling.scale)": [[81, "unit_scaling.scale.scale_fwd", false]], "scaled_dot_product_attention() (in module unit_scaling.functional)": [[60, "unit_scaling.functional.scaled_dot_product_attention", false]], "scaled_parameters() (in module unit_scaling.optim)": [[71, "unit_scaling.optim.scaled_parameters", false]], "scalepair (class in unit_scaling.utils)": [[99, "unit_scaling.utils.ScalePair", false]], "scaletracker (class in unit_scaling.utils)": [[100, "unit_scaling.utils.ScaleTracker", false]], "scaletrackinginterpreter (class in unit_scaling.utils)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter", false]], "scatter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.scatter", false]], "scatter_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.scatter_", false]], "scatter_add() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.scatter_add", false]], "scatter_add_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.scatter_add_", false]], "scatter_reduce() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.scatter_reduce", false]], "scatter_reduce_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.scatter_reduce_", false]], "select() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.select", false]], "select_scatter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.select_scatter", false]], "set_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.set_", false]], "set_materialize_grads() (unit_scaling.utils.scaletracker method)": [[100, "unit_scaling.utils.ScaleTracker.set_materialize_grads", false]], "setdefault() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.setdefault", false]], "setup_context() (unit_scaling.utils.scaletracker static method)": [[100, "unit_scaling.utils.ScaleTracker.setup_context", false]], "sgd (class in unit_scaling.optim)": [[67, "unit_scaling.optim.SGD", false]], "sgn() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sgn", false]], "sgn_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sgn_", false]], "shape (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.shape", false]], "share_memory_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.share_memory_", false]], "short() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.short", false]], "sigmoid() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sigmoid", false]], "sigmoid_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sigmoid_", false]], "sign() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sign", false]], "sign_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sign_", false]], "signbit() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.signbit", false]], "silu (class in unit_scaling)": [[16, "unit_scaling.SiLU", false]], "silu() (in module unit_scaling.functional)": [[61, "unit_scaling.functional.silu", false]], "silu_glu() (in module unit_scaling.functional)": [[62, "unit_scaling.functional.silu_glu", false]], "simulate_format() (in module unit_scaling.transforms)": [[89, "unit_scaling.transforms.simulate_format", false]], "simulate_fp8() (in module unit_scaling.transforms)": [[90, "unit_scaling.transforms.simulate_fp8", false]], "sin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sin", false]], "sin_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sin_", false]], "sinc() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sinc", false]], "sinc_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sinc_", false]], "sinh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sinh", false]], "sinh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sinh_", false]], "size() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.size", false]], "slice_scatter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.slice_scatter", false]], "slogdet() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.slogdet", false]], "smm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.smm", false]], "softmax (class in unit_scaling)": [[17, "unit_scaling.Softmax", false]], "softmax() (in module unit_scaling.functional)": [[63, "unit_scaling.functional.softmax", false]], "softmax() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.softmax", false]], "sort() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sort", false]], "sparse_dim() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sparse_dim", false]], "sparse_mask() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sparse_mask", false]], "sparse_resize_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sparse_resize_", false]], "sparse_resize_and_clear_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sparse_resize_and_clear_", false]], "split() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.split", false]], "sqrt() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sqrt", false]], "sqrt_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sqrt_", false]], "square() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.square", false]], "square_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.square_", false]], "squeeze() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.squeeze", false]], "squeeze_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.squeeze_", false]], "sspaddmm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sspaddmm", false]], "state_dict() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.state_dict", false]], "state_dict() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.state_dict", false]], "state_dict() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.state_dict", false]], "std() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.std", false]], "step() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.step", false]], "step() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.step", false]], "step() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.step", false]], "stft() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.stft", false]], "storage() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.storage", false]], "storage_offset() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.storage_offset", false]], "storage_type() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.storage_type", false]], "stride() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.stride", false]], "sub() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sub", false]], "sub_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sub_", false]], "subtract() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.subtract", false]], "subtract_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.subtract_", false]], "sum() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sum", false]], "sum_to_size() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sum_to_size", false]], "svd() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.svd", false]], "swapaxes() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.swapaxes", false]], "swapaxes_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.swapaxes_", false]], "swapdims() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.swapdims", false]], "swapdims_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.swapdims_", false]], "t (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.T", false]], "t() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.t", false]], "t_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.t_", false]], "take() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.take", false]], "take_along_dim() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.take_along_dim", false]], "tan() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tan", false]], "tan_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tan_", false]], "tanh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tanh", false]], "tanh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tanh_", false]], "tensor (class in unit_scaling.parameter)": [[77, "unit_scaling.parameter.Tensor", false]], "tensor_split() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tensor_split", false]], "tile() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tile", false]], "to() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to", false]], "to_dense() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_dense", false]], "to_grad_input_scale() (in module unit_scaling.constraints)": [[30, "unit_scaling.constraints.to_grad_input_scale", false]], "to_left_grad_scale() (in module unit_scaling.constraints)": [[31, "unit_scaling.constraints.to_left_grad_scale", false]], "to_mkldnn() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_mkldnn", false]], "to_output_scale() (in module unit_scaling.constraints)": [[32, "unit_scaling.constraints.to_output_scale", false]], "to_padded_tensor() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_padded_tensor", false]], "to_right_grad_scale() (in module unit_scaling.constraints)": [[33, "unit_scaling.constraints.to_right_grad_scale", false]], "to_sparse() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_sparse", false]], "to_sparse_bsc() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_sparse_bsc", false]], "to_sparse_bsr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_sparse_bsr", false]], "to_sparse_coo() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_sparse_coo", false]], "to_sparse_csc() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_sparse_csc", false]], "to_sparse_csr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_sparse_csr", false]], "tolist() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tolist", false]], "topk() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.topk", false]], "torch_nn_modules_to_user_modules() (in module unit_scaling.transforms.utils)": [[97, "unit_scaling.transforms.utils.torch_nn_modules_to_user_modules", false]], "trace() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.trace", false]], "track_scales() (in module unit_scaling.transforms)": [[91, "unit_scaling.transforms.track_scales", false]], "transformer_residual_scaling_rule() (in module unit_scaling)": [[82, "unit_scaling.transformer_residual_scaling_rule", false]], "transformer_residual_scaling_rule() (in module unit_scaling.core.functional)": [[39, "unit_scaling.core.functional.transformer_residual_scaling_rule", false]], "transformerdecoder (class in unit_scaling)": [[18, "unit_scaling.TransformerDecoder", false]], "transformerlayer (class in unit_scaling)": [[19, "unit_scaling.TransformerLayer", false]], "transpose() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.transpose", false]], "transpose_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.transpose_", false]], "triangular_solve() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.triangular_solve", false]], "tril() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tril", false]], "tril_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tril_", false]], "triu() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.triu", false]], "triu_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.triu_", false]], "true_divide() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.true_divide", false]], "true_divide_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.true_divide_", false]], "trunc() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.trunc", false]], "trunc_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.trunc_", false]], "tuple_to_format() (in module unit_scaling.formats)": [[43, "unit_scaling.formats.tuple_to_format", false]], "type() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.type", false]], "type_as() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.type_as", false]], "unbind() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unbind", false]], "unflatten() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unflatten", false]], "unfold() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unfold", false]], "uniform_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.uniform_", false]], "unique() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unique", false]], "unique_consecutive() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unique_consecutive", false]], "unit_scale() (in module unit_scaling.transforms)": [[92, "unit_scaling.transforms.unit_scale", false]], "unit_scaling": [[1, "module-unit_scaling", false]], "unit_scaling.analysis": [[20, "module-unit_scaling.analysis", false]], "unit_scaling.constraints": [[25, "module-unit_scaling.constraints", false]], "unit_scaling.core": [[34, "module-unit_scaling.core", false]], "unit_scaling.core.functional": [[35, "module-unit_scaling.core.functional", false]], "unit_scaling.formats": [[40, "module-unit_scaling.formats", false]], "unit_scaling.functional": [[44, "module-unit_scaling.functional", false]], "unit_scaling.optim": [[64, "module-unit_scaling.optim", false]], "unit_scaling.parameter": [[72, "module-unit_scaling.parameter", false]], "unit_scaling.scale": [[79, "module-unit_scaling.scale", false]], "unit_scaling.transforms": [[83, "module-unit_scaling.transforms", false]], "unit_scaling.transforms.utils": [[93, "module-unit_scaling.transforms.utils", false]], "unit_scaling.utils": [[98, "module-unit_scaling.utils", false]], "unsafe_chunk() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unsafe_chunk", false]], "unsafe_split() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unsafe_split", false]], "unsqueeze() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unsqueeze", false]], "unsqueeze_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unsqueeze_", false]], "untyped_storage() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.untyped_storage", false]], "update() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.update", false]], "values() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.values", false]], "values() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.values", false]], "var() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.var", false]], "vdot() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.vdot", false]], "view() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.view", false]], "view_as() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.view_as", false]], "visualiser() (in module unit_scaling)": [[103, "unit_scaling.visualiser", false]], "visualiser() (in module unit_scaling.analysis)": [[24, "unit_scaling.analysis.visualiser", false]], "vjp() (unit_scaling.utils.scaletracker static method)": [[100, "unit_scaling.utils.ScaleTracker.vjp", false]], "vmap() (unit_scaling.utils.scaletracker static method)": [[100, "unit_scaling.utils.ScaleTracker.vmap", false]], "vsplit() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.vsplit", false]], "weight (unit_scaling.conv1d attribute)": [[2, "unit_scaling.Conv1d.weight", false]], "weight (unit_scaling.embedding attribute)": [[7, "unit_scaling.Embedding.weight", false]], "weight (unit_scaling.layernorm attribute)": [[9, "unit_scaling.LayerNorm.weight", false]], "weight (unit_scaling.linear attribute)": [[10, "unit_scaling.Linear.weight", false]], "weight (unit_scaling.linearreadout attribute)": [[11, "unit_scaling.LinearReadout.weight", false]], "weight (unit_scaling.rmsnorm attribute)": [[15, "unit_scaling.RMSNorm.weight", false]], "where() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.where", false]], "xlogy() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.xlogy", false]], "xlogy_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.xlogy_", false]], "xpu() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.xpu", false]], "zero_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.zero_", false]], "zero_grad() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.zero_grad", false]], "zero_grad() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.zero_grad", false]], "zero_grad() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.zero_grad", false]]}, "objects": {"": [[1, 0, 0, "-", "unit_scaling"]], "unit_scaling": [[2, 1, 1, "", "Conv1d"], [3, 1, 1, "", "CrossEntropyLoss"], [4, 1, 1, "", "DepthModuleList"], [5, 1, 1, "", "DepthSequential"], [6, 1, 1, "", "Dropout"], [7, 1, 1, "", "Embedding"], [8, 1, 1, "", "GELU"], [9, 1, 1, "", "LayerNorm"], [10, 1, 1, "", "Linear"], [11, 1, 1, "", "LinearReadout"], [12, 1, 1, "", "MHSA"], [13, 1, 1, "", "MLP"], [14, 4, 1, "", "Parameter"], [15, 1, 1, "", "RMSNorm"], [16, 1, 1, "", "SiLU"], [17, 1, 1, "", "Softmax"], [18, 1, 1, "", "TransformerDecoder"], [19, 1, 1, "", "TransformerLayer"], [20, 0, 0, "-", "analysis"], [25, 0, 0, "-", "constraints"], [34, 0, 0, "-", "core"], [40, 0, 0, "-", "formats"], [44, 0, 0, "-", "functional"], [64, 0, 0, "-", "optim"], [72, 0, 0, "-", "parameter"], [79, 0, 0, "-", "scale"], [82, 4, 1, "", "transformer_residual_scaling_rule"], [83, 0, 0, "-", "transforms"], [98, 0, 0, "-", "utils"], [103, 4, 1, "", "visualiser"]], "unit_scaling.Conv1d": [[2, 2, 1, "", "bias"], [2, 2, 1, "", "weight"]], "unit_scaling.DepthModuleList": [[4, 3, 1, "", "append"], [4, 3, 1, "", "extend"], [4, 3, 1, "", "insert"]], "unit_scaling.DepthSequential": [[5, 3, 1, "", "append"]], "unit_scaling.Embedding": [[7, 3, 1, "", "from_pretrained"], [7, 2, 1, "", "weight"]], "unit_scaling.LayerNorm": [[9, 2, 1, "", "bias"], [9, 2, 1, "", "weight"]], "unit_scaling.Linear": [[10, 2, 1, "", "bias"], [10, 2, 1, "", "weight"]], "unit_scaling.LinearReadout": [[11, 2, 1, "", "bias"], [11, 2, 1, "", "weight"]], "unit_scaling.RMSNorm": [[15, 2, 1, "", "weight"]], "unit_scaling.TransformerDecoder": [[18, 3, 1, "", "append"]], "unit_scaling.analysis": [[21, 4, 1, "", "example_batch"], [22, 4, 1, "", "graph_to_dataframe"], [23, 4, 1, "", "plot"], [24, 4, 1, "", "visualiser"]], "unit_scaling.constraints": [[26, 4, 1, "", "amean"], [27, 4, 1, "", "apply_constraint"], [28, 4, 1, "", "gmean"], [29, 4, 1, "", "hmean"], [30, 4, 1, "", "to_grad_input_scale"], [31, 4, 1, "", "to_left_grad_scale"], [32, 4, 1, "", "to_output_scale"], [33, 4, 1, "", "to_right_grad_scale"]], "unit_scaling.core": [[35, 0, 0, "-", "functional"]], "unit_scaling.core.functional": [[36, 4, 1, "", "logarithmic_interpolation"], [37, 4, 1, "", "rms"], [38, 4, 1, "", "scale_elementwise"], [39, 4, 1, "", "transformer_residual_scaling_rule"]], "unit_scaling.formats": [[41, 1, 1, "", "FPFormat"], [42, 4, 1, "", "format_to_tuple"], [43, 4, 1, "", "tuple_to_format"]], "unit_scaling.formats.FPFormat": [[41, 5, 1, "", "bits"], [41, 5, 1, "", "max_absolute_value"], [41, 5, 1, "", "min_absolute_normal"], [41, 5, 1, "", "min_absolute_subnormal"], [41, 3, 1, "", "quantise"], [41, 3, 1, "", "quantise_bwd"], [41, 3, 1, "", "quantise_fwd"]], "unit_scaling.functional": [[45, 4, 1, "", "add"], [46, 4, 1, "", "conv1d"], [47, 4, 1, "", "cross_entropy"], [48, 4, 1, "", "dropout"], [49, 4, 1, "", "embedding"], [50, 4, 1, "", "gelu"], [51, 4, 1, "", "layer_norm"], [52, 4, 1, "", "linear"], [53, 4, 1, "", "linear_readout"], [54, 4, 1, "", "matmul"], [55, 4, 1, "", "mse_loss"], [56, 4, 1, "", "residual_add"], [57, 4, 1, "", "residual_apply"], [58, 4, 1, "", "residual_split"], [59, 4, 1, "", "rms_norm"], [60, 4, 1, "", "scaled_dot_product_attention"], [61, 4, 1, "", "silu"], [62, 4, 1, "", "silu_glu"], [63, 4, 1, "", "softmax"]], "unit_scaling.optim": [[65, 1, 1, "", "Adam"], [66, 1, 1, "", "AdamW"], [67, 1, 1, "", "SGD"], [68, 4, 1, "", "lr_scale_for_depth"], [69, 4, 1, "", "lr_scale_func_adam"], [70, 4, 1, "", "lr_scale_func_sgd"], [71, 4, 1, "", "scaled_parameters"]], "unit_scaling.optim.Adam": [[65, 3, 1, "", "add_param_group"], [65, 3, 1, "", "load_state_dict"], [65, 3, 1, "", "register_load_state_dict_post_hook"], [65, 3, 1, "", "register_load_state_dict_pre_hook"], [65, 3, 1, "", "register_state_dict_post_hook"], [65, 3, 1, "", "register_state_dict_pre_hook"], [65, 3, 1, "", "register_step_post_hook"], [65, 3, 1, "", "register_step_pre_hook"], [65, 3, 1, "", "state_dict"], [65, 3, 1, "", "step"], [65, 3, 1, "", "zero_grad"]], "unit_scaling.optim.AdamW": [[66, 3, 1, "", "add_param_group"], [66, 3, 1, "", "load_state_dict"], [66, 3, 1, "", "register_load_state_dict_post_hook"], [66, 3, 1, "", "register_load_state_dict_pre_hook"], [66, 3, 1, "", "register_state_dict_post_hook"], [66, 3, 1, "", "register_state_dict_pre_hook"], [66, 3, 1, "", "register_step_post_hook"], [66, 3, 1, "", "register_step_pre_hook"], [66, 3, 1, "", "state_dict"], [66, 3, 1, "", "step"], [66, 3, 1, "", "zero_grad"]], "unit_scaling.optim.SGD": [[67, 3, 1, "", "add_param_group"], [67, 3, 1, "", "load_state_dict"], [67, 3, 1, "", "register_load_state_dict_post_hook"], [67, 3, 1, "", "register_load_state_dict_pre_hook"], [67, 3, 1, "", "register_state_dict_post_hook"], [67, 3, 1, "", "register_state_dict_pre_hook"], [67, 3, 1, "", "register_step_post_hook"], [67, 3, 1, "", "register_step_pre_hook"], [67, 3, 1, "", "state_dict"], [67, 3, 1, "", "step"], [67, 3, 1, "", "zero_grad"]], "unit_scaling.parameter": [[73, 1, 1, "", "OrderedDict"], [74, 4, 1, "", "Parameter"], [75, 1, 1, "", "ParameterData"], [76, 1, 1, "", "Protocol"], [77, 1, 1, "", "Tensor"], [78, 4, 1, "", "has_parameter_data"]], "unit_scaling.parameter.OrderedDict": [[73, 3, 1, "", "clear"], [73, 3, 1, "", "copy"], [73, 3, 1, "", "fromkeys"], [73, 3, 1, "", "get"], [73, 3, 1, "", "items"], [73, 3, 1, "", "keys"], [73, 3, 1, "", "move_to_end"], [73, 3, 1, "", "pop"], [73, 3, 1, "", "popitem"], [73, 3, 1, "", "setdefault"], [73, 3, 1, "", "update"], [73, 3, 1, "", "values"]], "unit_scaling.parameter.Tensor": [[77, 2, 1, "", "H"], [77, 2, 1, "", "T"], [77, 3, 1, "", "abs"], [77, 3, 1, "", "abs_"], [77, 3, 1, "", "absolute"], [77, 3, 1, "", "absolute_"], [77, 3, 1, "", "acos"], [77, 3, 1, "", "acos_"], [77, 3, 1, "", "acosh"], [77, 3, 1, "", "acosh_"], [77, 3, 1, "", "add"], [77, 3, 1, "", "add_"], [77, 3, 1, "", "addbmm"], [77, 3, 1, "", "addbmm_"], [77, 3, 1, "", "addcdiv"], [77, 3, 1, "", "addcdiv_"], [77, 3, 1, "", "addcmul"], [77, 3, 1, "", "addcmul_"], [77, 3, 1, "", "addmm"], [77, 3, 1, "", "addmm_"], [77, 3, 1, "", "addmv"], [77, 3, 1, "", "addmv_"], [77, 3, 1, "", "addr"], [77, 3, 1, "", "addr_"], [77, 3, 1, "", "adjoint"], [77, 3, 1, "", "align_as"], [77, 3, 1, "", "align_to"], [77, 3, 1, "", "all"], [77, 3, 1, "", "allclose"], [77, 3, 1, "", "amax"], [77, 3, 1, "", "amin"], [77, 3, 1, "", "aminmax"], [77, 3, 1, "", "angle"], [77, 3, 1, "", "any"], [77, 3, 1, "", "apply_"], [77, 3, 1, "", "arccos"], [77, 3, 1, "", "arccos_"], [77, 3, 1, "", "arccosh"], [77, 3, 1, "", "arccosh_"], [77, 3, 1, "", "arcsin"], [77, 3, 1, "", "arcsin_"], [77, 3, 1, "", "arcsinh"], [77, 3, 1, "", "arcsinh_"], [77, 3, 1, "", "arctan"], [77, 3, 1, "", "arctan2"], [77, 3, 1, "", "arctan2_"], [77, 3, 1, "", "arctan_"], [77, 3, 1, "", "arctanh"], [77, 3, 1, "", "arctanh_"], [77, 3, 1, "", "argmax"], [77, 3, 1, "", "argmin"], [77, 3, 1, "", "argsort"], [77, 3, 1, "", "argwhere"], [77, 3, 1, "", "as_strided"], [77, 3, 1, "", "as_strided_"], [77, 3, 1, "", "as_strided_scatter"], [77, 3, 1, "", "as_subclass"], [77, 3, 1, "", "asin"], [77, 3, 1, "", "asin_"], [77, 3, 1, "", "asinh"], [77, 3, 1, "", "asinh_"], [77, 3, 1, "", "atan"], [77, 3, 1, "", "atan2"], [77, 3, 1, "", "atan2_"], [77, 3, 1, "", "atan_"], [77, 3, 1, "", "atanh"], [77, 3, 1, "", "atanh_"], [77, 3, 1, "", "backward"], [77, 3, 1, "", "baddbmm"], [77, 3, 1, "", "baddbmm_"], [77, 3, 1, "", "bernoulli"], [77, 3, 1, "", "bernoulli_"], [77, 3, 1, "", "bfloat16"], [77, 3, 1, "", "bincount"], [77, 3, 1, "", "bitwise_and"], [77, 3, 1, "", "bitwise_and_"], [77, 3, 1, "", "bitwise_left_shift"], [77, 3, 1, "", "bitwise_left_shift_"], [77, 3, 1, "", "bitwise_not"], [77, 3, 1, "", "bitwise_not_"], [77, 3, 1, "", "bitwise_or"], [77, 3, 1, "", "bitwise_or_"], [77, 3, 1, "", "bitwise_right_shift"], [77, 3, 1, "", "bitwise_right_shift_"], [77, 3, 1, "", "bitwise_xor"], [77, 3, 1, "", "bitwise_xor_"], [77, 3, 1, "", "bmm"], [77, 3, 1, "", "bool"], [77, 3, 1, "", "broadcast_to"], [77, 3, 1, "", "byte"], [77, 3, 1, "", "cauchy_"], [77, 3, 1, "", "cdouble"], [77, 3, 1, "", "ceil"], [77, 3, 1, "", "ceil_"], [77, 3, 1, "", "cfloat"], [77, 3, 1, "", "chalf"], [77, 3, 1, "", "char"], [77, 3, 1, "", "cholesky"], [77, 3, 1, "", "cholesky_inverse"], [77, 3, 1, "", "cholesky_solve"], [77, 3, 1, "", "chunk"], [77, 3, 1, "", "clamp"], [77, 3, 1, "", "clamp_"], [77, 3, 1, "", "clip"], [77, 3, 1, "", "clip_"], [77, 3, 1, "", "clone"], [77, 3, 1, "", "coalesce"], [77, 3, 1, "", "col_indices"], [77, 3, 1, "", "conj"], [77, 3, 1, "", "conj_physical"], [77, 3, 1, "", "conj_physical_"], [77, 3, 1, "", "contiguous"], [77, 3, 1, "", "copy_"], [77, 3, 1, "", "copysign"], [77, 3, 1, "", "copysign_"], [77, 3, 1, "", "corrcoef"], [77, 3, 1, "", "cos"], [77, 3, 1, "", "cos_"], [77, 3, 1, "", "cosh"], [77, 3, 1, "", "cosh_"], [77, 3, 1, "", "count_nonzero"], [77, 3, 1, "", "cov"], [77, 3, 1, "", "cpu"], [77, 3, 1, "", "cross"], [77, 3, 1, "", "crow_indices"], [77, 3, 1, "", "cuda"], [77, 3, 1, "", "cummax"], [77, 3, 1, "", "cummin"], [77, 3, 1, "", "cumprod"], [77, 3, 1, "", "cumprod_"], [77, 3, 1, "", "cumsum"], [77, 3, 1, "", "cumsum_"], [77, 3, 1, "", "data_ptr"], [77, 3, 1, "", "deg2rad"], [77, 3, 1, "", "deg2rad_"], [77, 3, 1, "", "dense_dim"], [77, 3, 1, "", "dequantize"], [77, 3, 1, "", "det"], [77, 3, 1, "", "detach"], [77, 3, 1, "", "detach_"], [77, 2, 1, "", "device"], [77, 3, 1, "", "diag"], [77, 3, 1, "", "diag_embed"], [77, 3, 1, "", "diagflat"], [77, 3, 1, "", "diagonal"], [77, 3, 1, "", "diagonal_scatter"], [77, 3, 1, "", "diff"], [77, 3, 1, "", "digamma"], [77, 3, 1, "", "digamma_"], [77, 3, 1, "", "dim"], [77, 3, 1, "", "dim_order"], [77, 3, 1, "", "dist"], [77, 3, 1, "", "div"], [77, 3, 1, "", "div_"], [77, 3, 1, "", "divide"], [77, 3, 1, "", "divide_"], [77, 3, 1, "", "dot"], [77, 3, 1, "", "double"], [77, 3, 1, "", "dsplit"], [77, 3, 1, "", "element_size"], [77, 3, 1, "", "eq"], [77, 3, 1, "", "eq_"], [77, 3, 1, "", "equal"], [77, 3, 1, "", "erf"], [77, 3, 1, "", "erf_"], [77, 3, 1, "", "erfc"], [77, 3, 1, "", "erfc_"], [77, 3, 1, "", "erfinv"], [77, 3, 1, "", "erfinv_"], [77, 3, 1, "", "exp"], [77, 3, 1, "", "exp2"], [77, 3, 1, "", "exp2_"], [77, 3, 1, "", "exp_"], [77, 3, 1, "", "expand"], [77, 3, 1, "", "expand_as"], [77, 3, 1, "", "expm1"], [77, 3, 1, "", "expm1_"], [77, 3, 1, "", "exponential_"], [77, 3, 1, "", "fill_"], [77, 3, 1, "", "fill_diagonal_"], [77, 3, 1, "", "fix"], [77, 3, 1, "", "fix_"], [77, 3, 1, "", "flatten"], [77, 3, 1, "", "flip"], [77, 3, 1, "", "fliplr"], [77, 3, 1, "", "flipud"], [77, 3, 1, "", "float"], [77, 3, 1, "", "float_power"], [77, 3, 1, "", "float_power_"], [77, 3, 1, "", "floor"], [77, 3, 1, "", "floor_"], [77, 3, 1, "", "floor_divide"], [77, 3, 1, "", "floor_divide_"], [77, 3, 1, "", "fmax"], [77, 3, 1, "", "fmin"], [77, 3, 1, "", "fmod"], [77, 3, 1, "", "fmod_"], [77, 3, 1, "", "frac"], [77, 3, 1, "", "frac_"], [77, 3, 1, "", "frexp"], [77, 3, 1, "", "gather"], [77, 3, 1, "", "gcd"], [77, 3, 1, "", "gcd_"], [77, 3, 1, "", "ge"], [77, 3, 1, "", "ge_"], [77, 3, 1, "", "geometric_"], [77, 3, 1, "", "geqrf"], [77, 3, 1, "", "ger"], [77, 3, 1, "", "get_device"], [77, 2, 1, "", "grad"], [77, 3, 1, "", "greater"], [77, 3, 1, "", "greater_"], [77, 3, 1, "", "greater_equal"], [77, 3, 1, "", "greater_equal_"], [77, 3, 1, "", "gt"], [77, 3, 1, "", "gt_"], [77, 3, 1, "", "half"], [77, 3, 1, "", "hardshrink"], [77, 3, 1, "", "has_names"], [77, 3, 1, "", "heaviside"], [77, 3, 1, "", "heaviside_"], [77, 3, 1, "", "histc"], [77, 3, 1, "", "histogram"], [77, 3, 1, "", "hsplit"], [77, 3, 1, "", "hypot"], [77, 3, 1, "", "hypot_"], [77, 3, 1, "", "i0"], [77, 3, 1, "", "i0_"], [77, 3, 1, "", "igamma"], [77, 3, 1, "", "igamma_"], [77, 3, 1, "", "igammac"], [77, 3, 1, "", "igammac_"], [77, 2, 1, "", "imag"], [77, 3, 1, "", "index_add"], [77, 3, 1, "", "index_add_"], [77, 3, 1, "", "index_copy"], [77, 3, 1, "", "index_copy_"], [77, 3, 1, "", "index_fill"], [77, 3, 1, "", "index_fill_"], [77, 3, 1, "", "index_put"], [77, 3, 1, "", "index_put_"], [77, 3, 1, "", "index_reduce_"], [77, 3, 1, "", "index_select"], [77, 3, 1, "", "indices"], [77, 3, 1, "", "inner"], [77, 3, 1, "", "int"], [77, 3, 1, "", "int_repr"], [77, 3, 1, "", "inverse"], [77, 3, 1, "", "ipu"], [77, 3, 1, "", "is_coalesced"], [77, 3, 1, "", "is_complex"], [77, 3, 1, "", "is_conj"], [77, 3, 1, "", "is_contiguous"], [77, 2, 1, "", "is_cpu"], [77, 2, 1, "", "is_cuda"], [77, 3, 1, "", "is_floating_point"], [77, 3, 1, "", "is_inference"], [77, 2, 1, "", "is_ipu"], [77, 2, 1, "", "is_leaf"], [77, 2, 1, "", "is_meta"], [77, 2, 1, "", "is_mps"], [77, 3, 1, "", "is_neg"], [77, 3, 1, "", "is_pinned"], [77, 2, 1, "", "is_quantized"], [77, 3, 1, "", "is_set_to"], [77, 3, 1, "", "is_shared"], [77, 3, 1, "", "is_signed"], [77, 2, 1, "", "is_sparse"], [77, 2, 1, "", "is_sparse_csr"], [77, 2, 1, "", "is_xla"], [77, 2, 1, "", "is_xpu"], [77, 3, 1, "", "isclose"], [77, 3, 1, "", "isfinite"], [77, 3, 1, "", "isinf"], [77, 3, 1, "", "isnan"], [77, 3, 1, "", "isneginf"], [77, 3, 1, "", "isposinf"], [77, 3, 1, "", "isreal"], [77, 3, 1, "", "istft"], [77, 3, 1, "", "item"], [77, 2, 1, "", "itemsize"], [77, 3, 1, "", "kron"], [77, 3, 1, "", "kthvalue"], [77, 3, 1, "", "lcm"], [77, 3, 1, "", "lcm_"], [77, 3, 1, "", "ldexp"], [77, 3, 1, "", "ldexp_"], [77, 3, 1, "", "le"], [77, 3, 1, "", "le_"], [77, 3, 1, "", "lerp"], [77, 3, 1, "", "lerp_"], [77, 3, 1, "", "less"], [77, 3, 1, "", "less_"], [77, 3, 1, "", "less_equal"], [77, 3, 1, "", "less_equal_"], [77, 3, 1, "", "lgamma"], [77, 3, 1, "", "lgamma_"], [77, 3, 1, "", "log"], [77, 3, 1, "", "log10"], [77, 3, 1, "", "log10_"], [77, 3, 1, "", "log1p"], [77, 3, 1, "", "log1p_"], [77, 3, 1, "", "log2"], [77, 3, 1, "", "log2_"], [77, 3, 1, "", "log_"], [77, 3, 1, "", "log_normal_"], [77, 3, 1, "", "logaddexp"], [77, 3, 1, "", "logaddexp2"], [77, 3, 1, "", "logcumsumexp"], [77, 3, 1, "", "logdet"], [77, 3, 1, "", "logical_and"], [77, 3, 1, "", "logical_and_"], [77, 3, 1, "", "logical_not"], [77, 3, 1, "", "logical_not_"], [77, 3, 1, "", "logical_or"], [77, 3, 1, "", "logical_or_"], [77, 3, 1, "", "logical_xor"], [77, 3, 1, "", "logical_xor_"], [77, 3, 1, "", "logit"], [77, 3, 1, "", "logit_"], [77, 3, 1, "", "logsumexp"], [77, 3, 1, "", "long"], [77, 3, 1, "", "lt"], [77, 3, 1, "", "lt_"], [77, 3, 1, "", "lu"], [77, 3, 1, "", "lu_solve"], [77, 2, 1, "", "mH"], [77, 2, 1, "", "mT"], [77, 3, 1, "", "map_"], [77, 3, 1, "", "masked_fill"], [77, 3, 1, "", "masked_fill_"], [77, 3, 1, "", "masked_scatter"], [77, 3, 1, "", "masked_scatter_"], [77, 3, 1, "", "masked_select"], [77, 3, 1, "", "matmul"], [77, 3, 1, "", "matrix_exp"], [77, 3, 1, "", "matrix_power"], [77, 3, 1, "", "max"], [77, 3, 1, "", "maximum"], [77, 3, 1, "", "mean"], [77, 3, 1, "", "median"], [77, 3, 1, "", "min"], [77, 3, 1, "", "minimum"], [77, 3, 1, "", "mm"], [77, 3, 1, "", "mode"], [77, 3, 1, "", "module_load"], [77, 3, 1, "", "moveaxis"], [77, 3, 1, "", "movedim"], [77, 3, 1, "", "msort"], [77, 3, 1, "", "mtia"], [77, 3, 1, "", "mul"], [77, 3, 1, "", "mul_"], [77, 3, 1, "", "multinomial"], [77, 3, 1, "", "multiply"], [77, 3, 1, "", "multiply_"], [77, 3, 1, "", "mv"], [77, 3, 1, "", "mvlgamma"], [77, 3, 1, "", "mvlgamma_"], [77, 2, 1, "", "names"], [77, 3, 1, "", "nan_to_num"], [77, 3, 1, "", "nan_to_num_"], [77, 3, 1, "", "nanmean"], [77, 3, 1, "", "nanmedian"], [77, 3, 1, "", "nanquantile"], [77, 3, 1, "", "nansum"], [77, 3, 1, "", "narrow"], [77, 3, 1, "", "narrow_copy"], [77, 2, 1, "", "nbytes"], [77, 2, 1, "", "ndim"], [77, 3, 1, "", "ndimension"], [77, 3, 1, "", "ne"], [77, 3, 1, "", "ne_"], [77, 3, 1, "", "neg"], [77, 3, 1, "", "neg_"], [77, 3, 1, "", "negative"], [77, 3, 1, "", "negative_"], [77, 3, 1, "", "nelement"], [77, 3, 1, "", "new_empty"], [77, 3, 1, "", "new_empty_strided"], [77, 3, 1, "", "new_full"], [77, 3, 1, "", "new_ones"], [77, 3, 1, "", "new_tensor"], [77, 3, 1, "", "new_zeros"], [77, 3, 1, "", "nextafter"], [77, 3, 1, "", "nextafter_"], [77, 3, 1, "", "nonzero"], [77, 3, 1, "", "nonzero_static"], [77, 3, 1, "", "norm"], [77, 3, 1, "", "normal_"], [77, 3, 1, "", "not_equal"], [77, 3, 1, "", "not_equal_"], [77, 3, 1, "", "numel"], [77, 3, 1, "", "numpy"], [77, 3, 1, "", "orgqr"], [77, 3, 1, "", "ormqr"], [77, 3, 1, "", "outer"], [77, 3, 1, "", "permute"], [77, 3, 1, "", "pin_memory"], [77, 3, 1, "", "pinverse"], [77, 3, 1, "", "polygamma"], [77, 3, 1, "", "polygamma_"], [77, 3, 1, "", "positive"], [77, 3, 1, "", "pow"], [77, 3, 1, "", "pow_"], [77, 3, 1, "", "prod"], [77, 3, 1, "", "put"], [77, 3, 1, "", "put_"], [77, 3, 1, "", "q_per_channel_axis"], [77, 3, 1, "", "q_per_channel_scales"], [77, 3, 1, "", "q_per_channel_zero_points"], [77, 3, 1, "", "q_scale"], [77, 3, 1, "", "q_zero_point"], [77, 3, 1, "", "qr"], [77, 3, 1, "", "qscheme"], [77, 3, 1, "", "quantile"], [77, 3, 1, "", "rad2deg"], [77, 3, 1, "", "rad2deg_"], [77, 3, 1, "", "random_"], [77, 3, 1, "", "ravel"], [77, 2, 1, "", "real"], [77, 3, 1, "", "reciprocal"], [77, 3, 1, "", "reciprocal_"], [77, 3, 1, "", "record_stream"], [77, 3, 1, "", "refine_names"], [77, 3, 1, "", "register_hook"], [77, 3, 1, "", "register_post_accumulate_grad_hook"], [77, 3, 1, "", "remainder"], [77, 3, 1, "", "remainder_"], [77, 3, 1, "", "rename"], [77, 3, 1, "", "rename_"], [77, 3, 1, "", "renorm"], [77, 3, 1, "", "renorm_"], [77, 3, 1, "", "repeat"], [77, 3, 1, "", "repeat_interleave"], [77, 2, 1, "", "requires_grad"], [77, 3, 1, "", "requires_grad_"], [77, 3, 1, "", "reshape"], [77, 3, 1, "", "reshape_as"], [77, 3, 1, "", "resize_"], [77, 3, 1, "", "resize_as_"], [77, 3, 1, "", "resolve_conj"], [77, 3, 1, "", "resolve_neg"], [77, 3, 1, "", "retain_grad"], [77, 2, 1, "", "retains_grad"], [77, 3, 1, "", "roll"], [77, 3, 1, "", "rot90"], [77, 3, 1, "", "round"], [77, 3, 1, "", "round_"], [77, 3, 1, "", "rsqrt"], [77, 3, 1, "", "rsqrt_"], [77, 3, 1, "", "scatter"], [77, 3, 1, "", "scatter_"], [77, 3, 1, "", "scatter_add"], [77, 3, 1, "", "scatter_add_"], [77, 3, 1, "", "scatter_reduce"], [77, 3, 1, "", "scatter_reduce_"], [77, 3, 1, "", "select"], [77, 3, 1, "", "select_scatter"], [77, 3, 1, "", "set_"], [77, 3, 1, "", "sgn"], [77, 3, 1, "", "sgn_"], [77, 2, 1, "", "shape"], [77, 3, 1, "", "share_memory_"], [77, 3, 1, "", "short"], [77, 3, 1, "", "sigmoid"], [77, 3, 1, "", "sigmoid_"], [77, 3, 1, "", "sign"], [77, 3, 1, "", "sign_"], [77, 3, 1, "", "signbit"], [77, 3, 1, "", "sin"], [77, 3, 1, "", "sin_"], [77, 3, 1, "", "sinc"], [77, 3, 1, "", "sinc_"], [77, 3, 1, "", "sinh"], [77, 3, 1, "", "sinh_"], [77, 3, 1, "", "size"], [77, 3, 1, "", "slice_scatter"], [77, 3, 1, "", "slogdet"], [77, 3, 1, "", "smm"], [77, 3, 1, "", "softmax"], [77, 3, 1, "", "sort"], [77, 3, 1, "", "sparse_dim"], [77, 3, 1, "", "sparse_mask"], [77, 3, 1, "", "sparse_resize_"], [77, 3, 1, "", "sparse_resize_and_clear_"], [77, 3, 1, "", "split"], [77, 3, 1, "", "sqrt"], [77, 3, 1, "", "sqrt_"], [77, 3, 1, "", "square"], [77, 3, 1, "", "square_"], [77, 3, 1, "", "squeeze"], [77, 3, 1, "", "squeeze_"], [77, 3, 1, "", "sspaddmm"], [77, 3, 1, "", "std"], [77, 3, 1, "", "stft"], [77, 3, 1, "", "storage"], [77, 3, 1, "", "storage_offset"], [77, 3, 1, "", "storage_type"], [77, 3, 1, "", "stride"], [77, 3, 1, "", "sub"], [77, 3, 1, "", "sub_"], [77, 3, 1, "", "subtract"], [77, 3, 1, "", "subtract_"], [77, 3, 1, "", "sum"], [77, 3, 1, "", "sum_to_size"], [77, 3, 1, "", "svd"], [77, 3, 1, "", "swapaxes"], [77, 3, 1, "", "swapaxes_"], [77, 3, 1, "", "swapdims"], [77, 3, 1, "", "swapdims_"], [77, 3, 1, "", "t"], [77, 3, 1, "", "t_"], [77, 3, 1, "", "take"], [77, 3, 1, "", "take_along_dim"], [77, 3, 1, "", "tan"], [77, 3, 1, "", "tan_"], [77, 3, 1, "", "tanh"], [77, 3, 1, "", "tanh_"], [77, 3, 1, "", "tensor_split"], [77, 3, 1, "", "tile"], [77, 3, 1, "", "to"], [77, 3, 1, "", "to_dense"], [77, 3, 1, "", "to_mkldnn"], [77, 3, 1, "", "to_padded_tensor"], [77, 3, 1, "", "to_sparse"], [77, 3, 1, "", "to_sparse_bsc"], [77, 3, 1, "", "to_sparse_bsr"], [77, 3, 1, "", "to_sparse_coo"], [77, 3, 1, "", "to_sparse_csc"], [77, 3, 1, "", "to_sparse_csr"], [77, 3, 1, "", "tolist"], [77, 3, 1, "", "topk"], [77, 3, 1, "", "trace"], [77, 3, 1, "", "transpose"], [77, 3, 1, "", "transpose_"], [77, 3, 1, "", "triangular_solve"], [77, 3, 1, "", "tril"], [77, 3, 1, "", "tril_"], [77, 3, 1, "", "triu"], [77, 3, 1, "", "triu_"], [77, 3, 1, "", "true_divide"], [77, 3, 1, "", "true_divide_"], [77, 3, 1, "", "trunc"], [77, 3, 1, "", "trunc_"], [77, 3, 1, "", "type"], [77, 3, 1, "", "type_as"], [77, 3, 1, "", "unbind"], [77, 3, 1, "", "unflatten"], [77, 3, 1, "", "unfold"], [77, 3, 1, "", "uniform_"], [77, 3, 1, "", "unique"], [77, 3, 1, "", "unique_consecutive"], [77, 3, 1, "", "unsafe_chunk"], [77, 3, 1, "", "unsafe_split"], [77, 3, 1, "", "unsqueeze"], [77, 3, 1, "", "unsqueeze_"], [77, 3, 1, "", "untyped_storage"], [77, 3, 1, "", "values"], [77, 3, 1, "", "var"], [77, 3, 1, "", "vdot"], [77, 3, 1, "", "view"], [77, 3, 1, "", "view_as"], [77, 3, 1, "", "vsplit"], [77, 3, 1, "", "where"], [77, 3, 1, "", "xlogy"], [77, 3, 1, "", "xlogy_"], [77, 3, 1, "", "xpu"], [77, 3, 1, "", "zero_"]], "unit_scaling.scale": [[80, 4, 1, "", "scale_bwd"], [81, 4, 1, "", "scale_fwd"]], "unit_scaling.transforms": [[84, 1, 1, "", "Metrics"], [85, 4, 1, "", "compile"], [86, 4, 1, "", "prune_non_float_tensors"], [87, 4, 1, "", "prune_same_scale_tensors"], [88, 4, 1, "", "prune_selected_nodes"], [89, 4, 1, "", "simulate_format"], [90, 4, 1, "", "simulate_fp8"], [91, 4, 1, "", "track_scales"], [92, 4, 1, "", "unit_scale"], [93, 0, 0, "-", "utils"]], "unit_scaling.transforms.Metrics": [[84, 1, 1, "", "Data"]], "unit_scaling.transforms.utils": [[94, 4, 1, "", "apply_transform"], [95, 4, 1, "", "patch_to_expand_modules"], [96, 4, 1, "", "replace_node_with_function"], [97, 4, 1, "", "torch_nn_modules_to_user_modules"]], "unit_scaling.utils": [[99, 1, 1, "", "ScalePair"], [100, 1, 1, "", "ScaleTracker"], [101, 1, 1, "", "ScaleTrackingInterpreter"], [102, 4, 1, "", "analyse_module"]], "unit_scaling.utils.ScaleTracker": [[100, 3, 1, "", "backward"], [100, 3, 1, "", "jvp"], [100, 3, 1, "", "mark_dirty"], [100, 3, 1, "", "mark_non_differentiable"], [100, 3, 1, "", "save_for_backward"], [100, 3, 1, "", "save_for_forward"], [100, 3, 1, "", "set_materialize_grads"], [100, 3, 1, "", "setup_context"], [100, 3, 1, "", "vjp"], [100, 3, 1, "", "vmap"]], "unit_scaling.utils.ScaleTrackingInterpreter": [[101, 3, 1, "", "boxed_run"], [101, 3, 1, "", "call_function"], [101, 3, 1, "", "call_method"], [101, 3, 1, "", "call_module"], [101, 3, 1, "", "fetch_args_kwargs_from_env"], [101, 3, 1, "", "fetch_attr"], [101, 3, 1, "", "get_attr"], [101, 3, 1, "", "map_nodes_to_values"], [101, 3, 1, "", "output"], [101, 3, 1, "", "placeholder"], [101, 3, 1, "", "run"], [101, 3, 1, "", "run_node"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "method", "Python method"], "4": ["py", "function", "Python function"], "5": ["py", "property", "Python property"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:attribute", "3": "py:method", "4": "py:function", "5": "py:property"}, "terms": {"": [5, 13, 30, 31, 32, 33, 52, 53, 54, 60, 65, 66, 67, 73, 77, 91, 100, 101, 102, 107], "0": [2, 3, 6, 7, 8, 9, 12, 16, 17, 18, 19, 36, 37, 39, 41, 45, 46, 47, 48, 49, 50, 52, 56, 57, 58, 60, 61, 62, 63, 65, 66, 67, 71, 76, 77, 82, 100, 102, 107], "00": 77, "000": 107, "0000": [7, 49, 77], "0000e": 77, "0005": 77, "001": [65, 66, 67], "0036": 77, "0047": 77, "00894": 107, "00902": [102, 107], "00904": 102, "01": [65, 66, 67, 77, 107], "0100": 77, "012766935862600803": 77, "018": [102, 107], "0182": 102, "0200": 77, "0202": 45, "0237": 49, "0251": 7, "03": 107, "0300": 77, "0309": 7, "0310": 77, "0310j": 77, "0334": 77, "0334j": 77, "0364": 7, "0411": 77, "044715": [8, 50], "04745": 19, "05": [9, 15, 51, 59, 77, 87, 107], "0528": 49, "0545e": 77, "0558": 77, "0584": 77, "0633": 77, "0633j": 77, "0635": 7, "0638": 77, "0736": 45, "0748": 7, "0779": 77, "0793": 77, "08": 77, "08909505605697632": 77, "0949e": 77, "0985": 45, "0994": 45, "1": [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 36, 39, 45, 46, 47, 49, 50, 54, 56, 57, 58, 60, 61, 62, 63, 65, 66, 67, 71, 77, 82, 87, 100, 102, 104, 107], "10": [7, 9, 14, 36, 45, 49, 74, 77, 102, 107], "100": [3, 36, 47, 77], "1000": [7, 36], "10000": 21, "1000000000": 77, "1007": 77, "103": [21, 24, 103], "1055": 77, "1057999968": 77, "106": 77, "1061112040": 77, "1064483442": 77, "1069546515": 77, "1071760287": 77, "1073668768": 77, "1082790149": 77, "1084397505": 77, "1084649136": 77, "1088634448": 77, "1089989247": 77, "1097310419": 77, "10_000": 21, "10mm": [65, 66, 67], "11": 77, "1101533110": 77, "1105482831": 77, "110mm": [65, 66, 67], "111": 77, "112": 77, "1123489973": 77, "1124191867": 77, "1151": 7, "1158": 77, "117": 77, "12": 77, "127": 77, "128": [10, 11, 60, 77], "13": 102, "13mm": [65, 66, 67], "14": 77, "141592": 77, "1416": 77, "1450": 45, "146": 77, "147": 77, "1472": 21, "15": 77, "152": 77, "1520": 77, "1535": 7, "154": 77, "15mm": 67, "16": [2, 6, 46, 77, 87, 107], "1610": 45, "1611": 77, "165": 77, "1655": 7, "1685": 7, "17": 45, "177": [77, 107], "18": [45, 77], "182": 77, "185": 77, "1867": 49, "1868": 45, "188": 77, "189": 77, "19": 45, "190": 77, "191": 77, "192": 77, "198": 107, "1d": [2, 3, 46], "1e": [9, 15, 51, 59, 65, 66, 67, 77, 107], "2": [2, 6, 7, 8, 9, 15, 16, 17, 37, 45, 49, 50, 52, 53, 54, 60, 65, 66, 67, 77, 87, 100, 102, 107], "20": [2, 5, 6, 9, 10, 11, 45, 46, 77], "2002": 19, "202": 77, "2022": 90, "2023": 104, "203": 77, "204": [102, 107], "205": 77, "206": 77, "21": 45, "2109": 77, "22": 77, "226": 77, "227": 77, "23": 77, "2300": 77, "2316": 77, "2326": 77, "235": [77, 102], "236": 102, "2397": 77, "240": 77, "2419": 49, "243": 77, "2438": 49, "246": 77, "25": 77, "252": 77, "253": 77, "254": 77, "2569": 77, "2678": 7, "27": 77, "28": 77, "288": 102, "289": 107, "2976": 77, "2_t": [65, 66], "2d": [3, 77], "2nd": 77, "3": [2, 3, 7, 8, 9, 17, 45, 47, 49, 50, 60, 65, 66, 67, 77, 104, 107], "30": [10, 11, 46, 77], "3000": 7, "31": 102, "310": 102, "3100": 77, "32": [60, 77], "3216": 45, "322": 107, "33": [2, 46], "3310": 77, "337": 102, "3385": 49, "3448": 7, "3497": 45, "3506": 45, "3553": 77, "3553j": 77, "36": 77, "3616": 7, "3618": 49, "3667": 45, "3672": 45, "3677": 7, "3743": 45, "38": 107, "3925": 45, "3930": 45, "3944": 45, "3e": 107, "3rd": 77, "4": [7, 13, 45, 49, 77, 100, 102, 107], "4004": 7, "4022": 45, "4090": 45, "41": 77, "4161": 49, "43": 77, "4362": 7, "44": 77, "45": 107, "4600": 77, "4711": 77, "4724": 77, "48": 107, "4842e": 77, "4926": 77, "4970": 7, "4999": 77, "4pt": [65, 66, 67], "5": [3, 5, 6, 7, 8, 9, 15, 36, 45, 46, 47, 48, 49, 50, 52, 60, 65, 66, 67, 77, 107], "50": [2, 107], "5044": 77, "51": 102, "5138": 77, "5158": 77, "52587890625e": 87, "53": 77, "5316": 77, "5316j": 77, "5384": 49, "54": 102, "5415473580360413": 77, "544": 76, "5445": 77, "5503": 77, "5609": 49, "5617": 77, "5765e": 77, "578": [102, 107], "5803": 7, "5811": 45, "59": 77, "5mm": [65, 66, 67], "6": [7, 45, 77, 102], "60": 107, "60521": 77, "6056": 45, "6108": 49, "6123": 77, "6123j": 77, "6147": 45, "6245": 45, "6246": 49, "6262": 49, "63": 77, "64": [5, 60, 107], "6431": 7, "6492": 77, "6550": 77, "6606": 77, "6753": 49, "6778": 7, "6902": 7, "693": 107, "6969": 7, "6971": 45, "6e": 107, "7": [45, 77], "706": 107, "707": 107, "7089": 7, "716": 107, "7172": 7, "72": 77, "7265": 7, "729": 107, "7383": 49, "7471": 49, "7472": 77, "7472j": 77, "7695": 45, "7724": 45, "7729271650314331": 77, "7761": 49, "7794": 49, "7845": 45, "7895": 7, "7896": 77, "7896j": 77, "8": [45, 60, 65, 66, 77, 107], "8017": 45, "8119": 77, "8119j": 77, "8182e": 77, "83": 107, "8303": 77, "84": 107, "8490": 49, "8499": 77, "8499j": 77, "8612": 49, "8649": 77, "8649j": 77, "867061780": 77, "87": 77, "8720": 49, "89": 77, "9": [7, 49, 65, 66, 67, 77], "9124": 7, "93": 77, "9315": 7, "9400": 7, "9417": 77, "9482": 77, "95": 77, "9607": 77, "9625": 49, "9666": 49, "9678": 77, "9685": 7, "9732": 45, "9751": 49, "979": 107, "9853": 77, "9897": 7, "9902": 45, "9913": 77, "9913j": 77, "999": [65, 66, 102, 107], "A": [3, 4, 5, 6, 7, 12, 13, 17, 18, 19, 47, 49, 54, 60, 63, 65, 66, 67, 77, 84, 85, 104, 107], "As": [77, 90, 107], "At": 2, "But": 77, "By": [3, 47, 77, 92, 95, 97, 107], "For": [0, 2, 7, 9, 24, 36, 39, 46, 54, 60, 65, 66, 67, 71, 77, 82, 91, 100, 103, 104, 107], "If": [2, 3, 6, 7, 9, 10, 11, 39, 46, 47, 48, 49, 52, 53, 54, 60, 63, 65, 66, 67, 73, 77, 82, 89, 90, 100, 101], "In": [2, 8, 10, 11, 16, 17, 23, 38, 45, 46, 50, 52, 53, 54, 58, 60, 61, 63, 73, 77, 92, 95, 100, 107], "It": [2, 3, 5, 24, 60, 63, 65, 66, 67, 77, 100, 103, 107], "Its": [65, 66, 67], "NOT": [65, 66, 67], "No": 100, "On": [2, 5, 10, 11, 54, 65, 66, 67], "One": 77, "Such": 76, "The": [0, 2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 22, 23, 27, 38, 39, 41, 45, 46, 47, 49, 50, 52, 53, 54, 58, 60, 61, 63, 65, 66, 67, 74, 77, 82, 85, 86, 87, 89, 90, 91, 92, 94, 100, 101, 104, 107], "There": [60, 77, 100, 107], "These": [77, 91, 107], "To": [60, 77, 97, 101, 104, 107], "Will": 101, "_": [2, 6, 60, 65, 66, 67, 100], "__": 67, "__init__": [60, 102, 107], "_c": [102, 107], "_channel": [2, 46], "_dim": 7, "_dynamo": [85, 94, 95], "_featur": [10, 11, 52, 53], "_freez": 7, "_i": 45, "_index": 3, "_j": 2, "_modul": 94, "_nn": [102, 107], "_nnz": 77, "_shape": 9, "_size": 2, "_t": 67, "_tau": 18, "_tensor": 77, "_weight": 7, "a_dual": 100, "ab": [19, 77], "abl": [77, 92, 107], "about": [77, 84], "abov": [5, 60, 77, 107], "abs_": 77, "abs_max": [23, 84], "abs_mean": [23, 84], "abs_min": [23, 84], "absolut": [41, 77], "absolute_": 77, "accept": [3, 5, 77, 100], "access": [65, 66, 67, 77, 91, 100], "accompani": 96, "accord": [60, 77, 92], "accordingli": 17, "accumul": 77, "aco": 77, "acos_": 77, "acosh": 77, "acosh_": 77, "across": 77, "act": 76, "activ": [9, 16, 61, 89, 107], "actual": [65, 66, 67, 77], "ad": [0, 2, 5, 9, 15, 60, 65, 66, 67, 77, 91, 105, 107], "adam": [64, 66, 69, 71, 104], "adam_lr_scale_func": 71, "adamw": [64, 69, 104], "adapt": [6, 107], "add": [2, 4, 56, 65, 66, 67, 77, 92, 104, 105, 107], "add_": 77, "add_param_group": [65, 66, 67], "addbmm": 77, "addbmm_": 77, "addcdiv": 77, "addcdiv_": 77, "addcmul": 77, "addcmul_": 77, "addit": [9, 10, 11, 17, 45, 52, 53, 65, 66, 67, 77, 91, 107], "addition": 77, "addmm": 77, "addmm_": 77, "addmv": 77, "addmv_": 77, "addr": 77, "addr_": 77, "address": [77, 85, 107], "adjoint": 77, "adjust": 17, "advanc": 34, "advantag": 107, "advis": 77, "affect": 77, "affin": [9, 77], "after": [4, 5, 27, 54, 65, 66, 67, 77, 92, 100], "against": 104, "agnost": 77, "aim": 107, "al": 90, "algorithm": [2, 8, 46, 60, 65, 66], "alia": 77, "alias": 25, "align": [3, 47, 60, 65, 66, 67, 77], "align_a": 77, "align_to": 77, "all": [2, 4, 7, 10, 11, 22, 23, 24, 39, 49, 60, 63, 65, 66, 67, 73, 77, 82, 86, 87, 88, 92, 100, 103, 107], "allclos": 77, "alloc": 77, "allow": [3, 5, 58, 77, 85, 107], "allow_non_unit_scaling_param": [65, 66, 67, 71], "alon": 92, "along": [17, 21, 63, 65, 66, 67, 77, 107], "alongsid": 107, "alpha": [36, 45, 77], "alreadi": [65, 66, 67, 77], "also": [0, 2, 3, 14, 16, 57, 61, 65, 66, 74, 75, 77, 91, 92, 100, 104, 107], "altern": [5, 39, 64, 82, 107], "although": [105, 107], "altogeth": [65, 66, 67], "alwai": [60, 77, 91, 100], "amax": 77, "amean": [2, 8, 10, 11, 16, 17, 38, 45, 46, 50, 52, 53, 54, 61, 63, 104], "amin": 77, "aminmax": 77, "amount": [2, 3, 47], "amsgrad": [65, 66], "an": [2, 3, 4, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 21, 22, 23, 24, 32, 38, 39, 46, 48, 49, 54, 58, 60, 65, 66, 67, 71, 73, 74, 77, 82, 86, 87, 88, 89, 91, 92, 99, 100, 101, 103, 104, 107], "analog": 77, "analys": [20, 102, 107], "analyse_modul": [104, 107], "analysi": [91, 92, 102, 103, 104, 107], "analyt": 49, "angl": 77, "ani": [2, 5, 6, 7, 8, 9, 10, 11, 16, 17, 23, 24, 32, 46, 52, 53, 60, 64, 65, 66, 67, 71, 77, 88, 91, 92, 94, 95, 96, 100, 101, 102, 103, 104, 105, 107], "annot": [14, 22, 74, 102], "annotated_cod": 107, "anoth": [7, 77], "answer": 107, "anticip": 92, "anyon": 105, "api": [77, 101, 104, 107], "appear": 77, "append": [4, 5, 18, 54, 77, 107], "appendix": [39, 82], "appli": [0, 2, 3, 5, 8, 9, 10, 11, 12, 15, 16, 17, 21, 22, 23, 27, 38, 45, 46, 47, 48, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 61, 62, 63, 64, 71, 77, 80, 81, 85, 86, 87, 92, 94, 95, 100, 107], "applic": [3, 47, 77, 100], "apply_": 77, "apply_constraint": 104, "apply_transform": 104, "approach": [92, 94, 106, 107], "appropri": [11, 71, 77], "approxim": [8, 16, 50, 61, 107], "ar": [0, 2, 3, 4, 5, 6, 9, 10, 11, 15, 17, 23, 24, 39, 47, 49, 54, 58, 60, 65, 66, 67, 73, 76, 77, 82, 84, 87, 89, 90, 91, 92, 94, 100, 101, 103, 106, 107], "arang": 77, "arbitrari": [7, 22, 23, 24, 49, 60, 77, 92, 99, 103, 107], "arbitrarili": 77, "arcco": 77, "arccos_": 77, "arccosh": 77, "arccosh_": 77, "architectur": [3, 47], "arcsin": 77, "arcsin_": 77, "arcsinh": 77, "arcsinh_": 77, "arctan": 77, "arctan2": 77, "arctan2_": 77, "arctan_": 77, "arctanh": 77, "arctanh_": 77, "arg": [3, 5, 24, 38, 47, 65, 66, 67, 75, 77, 96, 100, 101, 103], "argmax": 77, "argmin": 77, "args_list": 101, "argsort": 77, "argument": [2, 3, 8, 50, 54, 60, 65, 66, 67, 77, 100, 101, 107], "argwher": 77, "aris": 107, "arithmet": 26, "around": [0, 77, 92, 94, 105], "arrai": 77, "array_lik": 77, "arriv": 107, "arrow": 23, "art": 107, "arxiv": 19, "as_strid": 77, "as_strided_": 77, "as_strided_scatt": 77, "as_subclass": 77, "asin": 77, "asin_": 77, "asinh": 77, "asinh_": 77, "assert": [14, 60, 74], "assign": [3, 77], "assist": 105, "associ": [65, 66, 67, 91], "assum": [0, 30, 31, 32, 33, 107], "assumpt": 107, "async": 77, "asynchron": 77, "atan": 77, "atan2": 77, "atan2_": 77, "atan_": 77, "atanh": 77, "atanh_": 77, "atol": 77, "attach": 77, "attain": 107, "attempt": [60, 107], "attent": [12, 18, 19, 21, 39, 60, 82, 107], "attn": [39, 82], "attn_bia": 60, "attn_mask": [21, 24, 60, 103], "attn_weight": 60, "attribut": [65, 66, 67, 72, 76, 77, 100, 101], "autograd": [52, 53, 54, 65, 66, 67, 77, 100], "automat": [4, 5, 23, 24, 60, 77, 91, 92, 95, 100, 102, 103, 107], "autowrap_funct": 102, "autowrap_modul": 102, "avail": [60, 91, 107], "averag": [3, 39, 47, 65, 66, 82], "avoid": 77, "awar": 77, "aweight": 77, "ax": [23, 24, 103], "axi": [23, 77], "axis0": 77, "axis1": 77, "b": [45, 52, 53, 67, 77, 100], "back": 77, "backend": [2, 46, 60, 85, 94, 95], "backend_1": 94, "backend_2": 94, "backward": [2, 3, 10, 11, 22, 23, 24, 41, 47, 54, 58, 65, 66, 67, 77, 79, 80, 84, 86, 87, 89, 90, 91, 92, 99, 100, 101, 102, 103, 107], "baddbmm": 77, "baddbmm_": 77, "bar": [23, 24, 103], "base": [24, 52, 53, 60, 67, 71, 76, 100, 103, 107], "basic": [92, 107], "batch": [2, 3, 7, 9, 21, 24, 47, 49, 54, 65, 66, 67, 77, 103], "batch1": 77, "batch2": 77, "batch_siz": [21, 24, 100, 103, 107], "becaus": [77, 107], "becom": [2, 3, 5, 47, 77], "been": [22, 23, 27, 77, 86, 87, 92, 100, 107], "befor": [4, 63, 65, 66, 67, 77, 89, 90, 100, 101, 107], "begin": [3, 47, 65, 66, 67, 73, 77, 107], "behav": [58, 65, 66, 67, 77], "behavior": [54, 65, 66, 67, 77, 100], "behavour": 102, "being": [3, 7, 47, 58, 65, 66, 67, 77, 85, 94, 100, 101], "belong": 101, "below": [0, 9, 17, 47, 107], "benchmark": 107, "benefici": 58, "bernoulli": [6, 48, 77], "bernoulli_": 77, "best": [105, 107], "beta": [9, 52, 53, 54, 60, 65, 66, 77, 104, 107], "beta_1": [65, 66], "beta_2": [65, 66], "better": [3, 60, 92], "between": [2, 3, 5, 23, 36, 39, 46, 47, 58, 65, 66, 67, 77, 82, 107], "beyond": [3, 65, 66, 107], "bf": [65, 66, 67], "bf16": 107, "bfloat16": [60, 65, 66, 67, 77], "bia": [2, 9, 10, 11, 14, 46, 51, 52, 53, 60, 74, 102, 107], "bias": [9, 92], "big": [65, 66], "bin": 77, "binari": 77, "bincount": 77, "bit": [41, 77], "bitwise_and": 77, "bitwise_and_": 77, "bitwise_left_shift": 77, "bitwise_left_shift_": 77, "bitwise_not": 77, "bitwise_not_": 77, "bitwise_or": 77, "bitwise_or_": 77, "bitwise_right_shift": 77, "bitwise_right_shift_": 77, "bitwise_xor": 77, "bitwise_xor_": 77, "blend": 3, "block": [2, 77, 107], "blocksiz": 77, "bmm": 77, "bool": [2, 3, 6, 7, 9, 10, 11, 12, 15, 16, 19, 23, 24, 37, 47, 48, 49, 55, 60, 61, 65, 66, 67, 71, 77, 96, 100, 101, 102, 103], "boolean": [9, 15, 60, 77, 100], "booltensor": 77, "both": [2, 9, 39, 46, 54, 60, 77, 82, 100, 102, 107], "bound": 77, "box": [64, 101, 104, 107], "boxed_run": 101, "branch": [19, 56, 57, 58, 107], "breakdown": 107, "brief": 107, "broad": 104, "broadcast": [45, 54, 60, 77], "broadcast_to": 77, "bsc": 77, "bsr": 77, "buffer": 77, "bug": [104, 105, 107], "build": [92, 107], "built": 0, "bwd": [84, 102, 107], "bwd_format": 89, "bwd_scale": 107, "byte": 77, "c": [2, 3, 9, 45, 47, 60, 76, 77, 100], "c_": 2, "cach": [77, 85, 94], "calcul": [9, 39, 68, 69, 70, 82], "call": [5, 6, 22, 23, 24, 60, 65, 66, 67, 77, 85, 86, 87, 89, 90, 91, 92, 94, 95, 96, 97, 100, 101, 102, 103], "call_funct": 101, "call_method": 101, "call_modul": 101, "callabl": [18, 38, 39, 57, 65, 66, 67, 70, 71, 77, 82, 88, 92, 94, 95, 96, 101, 102, 107], "can": [0, 2, 3, 4, 5, 6, 7, 23, 46, 58, 60, 65, 66, 67, 76, 77, 85, 87, 91, 92, 100, 101, 104, 105, 107], "cannot": [60, 77], "captur": [65, 66, 95, 97], "care": 107, "carefulli": 77, "carri": 77, "cascad": 5, "case": [2, 3, 8, 10, 11, 16, 17, 24, 38, 45, 46, 47, 50, 52, 53, 54, 61, 63, 65, 66, 67, 73, 77, 87, 92, 95, 97, 103, 107], "cast": [63, 77], "cat": 77, "cauchi": 77, "cauchy_": 77, "caus": [4, 5, 77, 107], "causal": [12, 19, 60], "causalbia": 60, "ccol_indic": 77, "cdot": 3, "cdoubl": 77, "ceil": 77, "ceil_": 77, "center": 77, "certain": [2, 10, 11, 51, 54, 59, 65, 66, 67, 87, 101], "cfloat": 77, "chain": [5, 77, 91], "chalf": 77, "chang": [3, 4, 5, 7, 8, 12, 16, 17, 23, 47, 50, 60, 61, 62, 63, 65, 66, 67, 77, 85, 87, 107], "channel": [2, 6, 9, 77], "channels_last": 77, "char": 77, "charact": 77, "characterist": [65, 66, 67], "check": [71, 76, 77, 78, 100, 107], "checker": 76, "choleski": 77, "cholesky_invers": 77, "cholesky_solv": 77, "choos": 100, "chosen": [6, 30, 31, 32, 33, 60], "chrome": 77, "chunk": [21, 77], "circular": 2, "circumst": [2, 46, 60], "cl": 77, "clamp": 77, "clamp_": 77, "class": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 40, 41, 47, 49, 60, 63, 64, 65, 66, 67, 72, 73, 75, 76, 77, 83, 84, 92, 97, 98, 99, 100, 101, 102, 107], "classif": 3, "classmethod": 7, "clear": [73, 101], "clearer": 23, "clearli": 107, "click": 0, "clip": [77, 107], "clip_": 77, "clone": [77, 100, 104, 107], "close": 0, "closur": [65, 66, 67], "co": [6, 77], "coalesc": 77, "code": [0, 5, 24, 77, 102, 103, 106, 107], "codebas": [104, 107], "coeffici": [65, 66], "coexist": 77, "coin": [16, 61], "col_indic": 77, "colour": [23, 24, 103], "column": [49, 77], "com": [77, 104, 107], "combin": [24, 52, 53, 54, 56, 57, 77, 85, 103, 107], "come": [56, 77, 85], "common": [1, 25, 44, 45, 65, 66, 67, 105, 107], "commun": 77, "compar": [92, 104], "compat": [77, 101], "compil": [92, 94, 95, 104, 105, 107], "complet": [77, 88, 105], "complex": [2, 45, 46, 77, 101], "complex128": [2, 46, 77], "complex32": [2, 46, 77], "complex64": [2, 46, 77], "compon": 34, "compos": [46, 85], "comprehens": 107, "compress": 77, "compromis": 107, "comput": [3, 6, 7, 9, 17, 26, 28, 29, 37, 39, 47, 49, 55, 60, 63, 65, 66, 77, 82, 100, 107], "compute_uv": 77, "concaten": 2, "concern": 107, "concret": [77, 101], "conda": 102, "condit": 77, "configur": [4, 5], "conj": 77, "conj_phys": 77, "conj_physical_": 77, "conjug": 77, "conjunct": [56, 58, 91, 95, 107], "connect": [2, 5, 19, 56, 57, 58, 92, 107], "consecut": 77, "consid": [3, 77, 107], "consider": 104, "constant": 36, "constrain": [26, 28, 29, 92, 107], "constraint": [2, 8, 10, 11, 16, 17, 19, 38, 45, 46, 50, 52, 53, 54, 60, 61, 63, 67, 101, 104, 107], "constraint_nam": 27, "construct": [4, 5, 7, 14, 49, 74, 77], "constructor": 5, "consum": 77, "contain": [3, 4, 5, 7, 22, 47, 49, 65, 66, 67, 77, 84, 86, 89, 90, 91, 96, 99, 100, 107], "content": [65, 66, 67, 100], "context": [60, 65, 66, 67, 77, 100], "contigu": 77, "contiguous_format": 77, "continu": 77, "contrast": 107, "contribut": [3, 7, 39, 47, 49, 56, 57, 58, 82], "control": [2, 18, 60, 64, 107], "conv": 2, "conv1": 5, "conv1d": 104, "conv2": 5, "conv2d": 5, "conveni": [24, 94, 103], "convent": [77, 101], "converg": [65, 66], "convers": [77, 107], "convert": [22, 42, 77, 97, 107], "convolut": [2, 46], "convolv": [2, 46], "coo": 77, "coordin": 77, "copi": [65, 66, 67, 73, 77, 92], "copy_": 77, "copysign": 77, "copysign_": 77, "core": [18, 104], "corrcoef": 77, "correct": [77, 89, 90, 100, 107], "correctli": 77, "correl": 2, "correspond": [3, 7, 8, 12, 16, 17, 24, 27, 43, 47, 49, 50, 60, 61, 62, 63, 65, 66, 67, 73, 77, 92, 100, 103], "cos_": 77, "cosh": 77, "cosh_": 77, "cost": [2, 46, 60, 77], "could": [77, 107], "count": 77, "count_nonzero": 77, "counterintuit": 77, "cov": 77, "cover": 107, "coverag": 105, "cpu": 77, "cpython": 102, "creat": [5, 7, 71, 73, 77], "create_graph": 77, "creation": 77, "criterion": 3, "cross": [2, 3, 47, 77, 107], "cross_entropi": [104, 107], "crossentropyloss": [47, 104, 107], "crow_indic": 77, "csc": 77, "csr": 77, "ctx": 100, "cuda": [2, 46, 60, 65, 66, 67, 77], "cuda0": 77, "cudacachingalloc": 77, "cudnn": [2, 46, 60], "cummax": 77, "cummin": 77, "cumprod": 77, "cumprod_": 77, "cumsum": 77, "cumsum_": 77, "cumul": [8, 50], "current": [18, 24, 60, 65, 66, 67, 77, 85, 92, 94, 101, 103, 104, 105, 106, 107], "custom": [77, 92, 100], "cycl": 77, "d": [9, 52, 53, 73, 77, 100, 102, 107], "d_1": [3, 47], "d_2": [3, 47], "d_k": [3, 47], "dampen": 67, "data": [2, 9, 10, 11, 14, 21, 46, 52, 53, 63, 74, 77, 84, 101], "data_ptr": 77, "dataclass": 99, "datafram": 22, "dataload": 77, "dataset": [21, 24, 103], "dataset_nam": [21, 24, 103], "dataset_path": [21, 24, 103], "dealloc": [77, 101], "decai": [65, 66, 67, 71], "decid": 77, "decim": 77, "decod": 18, "decompos": 107, "decor": [76, 100], "decoupl": 66, "deem": [23, 24, 87, 103], "deep": [67, 107], "def": [60, 76, 77, 92, 100, 102, 107], "default": [2, 3, 6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 21, 23, 24, 38, 39, 45, 46, 47, 48, 49, 50, 52, 53, 54, 56, 57, 58, 60, 61, 63, 65, 66, 67, 71, 73, 77, 82, 87, 92, 94, 95, 96, 97, 100, 102, 103, 107], "defin": [17, 27, 60, 63, 64, 65, 66, 67, 76, 77, 92, 100, 107], "definit": [92, 104, 106], "deg2rad": 77, "deg2rad_": 77, "degrad": 107, "del": 77, "delai": [58, 77, 107], "demonstr": [18, 104], "denomin": [9, 15, 65, 66], "denot": [2, 23, 77, 107], "dens": 77, "dense_dim": 77, "densiti": 77, "depend": [3, 23, 47, 54, 60, 71, 77, 92, 101], "deprec": [3, 47, 77], "depth": [4, 5, 39, 68, 82], "depthmodulelist": 104, "depthsequenti": 104, "depthwis": 2, "dequant": 77, "deriv": [77, 107], "descend": [77, 101], "describ": [2, 3, 6, 9, 15, 39, 47, 77, 82, 106], "design": [104, 107], "desir": [63, 77, 107], "despit": 105, "destin": 77, "det": 77, "detach": 77, "detach_": 77, "detail": [2, 7, 8, 10, 11, 16, 17, 38, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 59, 61, 63, 65, 66, 76, 77, 92, 100, 101, 105, 107], "detector": 6, "determin": [21, 54, 107], "determinist": [2, 7, 46, 49, 60, 77], "develop": [98, 102, 107], "deviat": [9, 77, 99, 100, 101, 102, 107], "devic": [2, 7, 9, 10, 11, 46, 52, 53, 54, 60, 77, 101], "dfrac": 77, "diag": 77, "diag_emb": 77, "diagflat": 77, "diagon": [60, 77], "diagonal_scatt": 77, "dict": [65, 66, 67, 71, 73, 77, 92, 96, 101], "dictionari": [7, 65, 66, 67, 73, 77, 92], "dictionaryand": 49, "did": [65, 66, 67], "diff": 77, "differ": [2, 5, 10, 11, 24, 49, 54, 58, 60, 65, 66, 67, 77, 79, 85, 94, 103, 107], "differenti": [41, 65, 66, 67, 77, 100], "difficulti": 105, "digamma": 77, "digamma_": 77, "dilat": [2, 46], "dim": [3, 17, 37, 47, 60, 63, 77], "dim0": 77, "dim1": 77, "dim2": 77, "dim_ord": 77, "dimens": [3, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 46, 51, 52, 53, 54, 59, 63, 77, 100], "dimension": [3, 7, 9, 17, 47, 54, 77], "directli": [77, 100, 105, 107], "disabl": [60, 71], "discret": 77, "discuss": 107, "displai": 23, "dist": 77, "distribut": [3, 6, 8, 47, 48, 50, 77, 105, 107], "div": 77, "div_": 77, "divid": [47, 77], "divide_": 77, "divis": [2, 46, 77, 107], "divisor": 77, "do": [3, 6, 7, 48, 49, 65, 66, 67, 77, 100, 101, 106, 107], "doc": [0, 105], "document": [0, 7, 17, 107], "doe": [2, 3, 4, 5, 7, 47, 54, 60, 65, 66, 67, 71, 73, 77, 85, 94, 107], "doesn": [2, 46, 85, 92, 94, 95, 97, 100], "don": [23, 24, 65, 66, 67, 86, 87, 103, 107], "done": [77, 92], "dot": [3, 54, 60, 77], "doubl": [77, 100], "down": 107, "downcast": 77, "downstream": 64, "draw": 77, "drawn": 77, "drift": 107, "drop": [77, 107], "dropout": [12, 18, 19, 60, 104], "dropout_p": [12, 18, 19, 60], "dsplit": 77, "dtype": [2, 3, 7, 9, 10, 11, 47, 52, 53, 54, 60, 63, 77, 100, 101], "dual_level": 100, "duck": 76, "due": [60, 65, 66, 67, 77], "dummi": 102, "duplic": 77, "dure": [6, 7, 23, 24, 49, 60, 77, 100, 103, 107], "dw": 46, "dynamo": 83, "dynload": 102, "e": [2, 3, 7, 9, 46, 49, 54, 60, 73, 77, 85, 86, 87, 91, 100], "e4": [90, 107], "e5": [90, 107], "each": [2, 3, 5, 6, 7, 9, 10, 11, 23, 24, 46, 47, 49, 52, 60, 65, 66, 67, 77, 89, 90, 91, 100, 101, 102, 103, 107], "eager": 107, "earli": 77, "easi": [92, 94], "easier": 107, "easili": 0, "effect": [6, 77, 89, 90, 107], "effici": [60, 77, 100, 107], "effort": 105, "einop": 102, "either": [2, 3, 47, 73, 77, 100], "element": [2, 3, 6, 9, 15, 17, 38, 46, 47, 48, 50, 55, 60, 63, 73, 77], "element_s": 77, "elementwise_affin": [9, 15], "elimin": [77, 107], "ell": 3, "ellipsi": 77, "els": [60, 65, 66, 67, 73, 77], "embed": [9, 18, 39, 82, 104], "embedding_dim": [7, 9, 49], "embedding_matrix": 49, "emploi": 107, "empti": [3, 77], "enabl": [60, 67, 71, 77, 79, 92, 94, 100, 107], "enable_flash_sdp": 60, "enable_gqa": 60, "enable_grad": 77, "enable_io_process": 101, "enable_math_sdp": 60, "enable_mem_efficient_sdp": 60, "encount": [104, 107], "encourag": 105, "end": [3, 4, 5, 18, 47, 65, 66, 67, 73, 77, 107], "end_dim": 77, "engin": [77, 100], "enough": 107, "ensur": [39, 60, 77, 82, 100, 101, 107], "entir": [9, 77, 107], "entri": [7, 49, 65, 66, 67, 77], "entropi": [3, 47, 107], "env": 100, "environ": 101, "ep": [9, 15, 37, 51, 59, 65, 66], "epsilon": [9, 15, 37, 65, 66], "eq": 77, "eq_": 77, "equal": [30, 31, 32, 33, 39, 49, 56, 57, 58, 77, 82, 107], "equal_nan": 77, "equat": 106, "equival": [0, 2, 3, 7, 9, 60, 77, 89, 90, 92, 100, 102, 107], "erf": 77, "erf_": 77, "erfc": 77, "erfc_": 77, "erfinv": 77, "erfinv_": 77, "error": [8, 16, 23, 24, 50, 55, 60, 61, 77, 100, 101, 103], "especi": 77, "estim": [8, 9, 50], "et": 90, "etc": 3, "ev": 60, "evalu": [6, 9, 60, 101, 107], "even": [46, 54, 77, 107], "evenli": 77, "event": [60, 77], "everi": [6, 17, 39, 77, 82, 100, 101], "ex": [65, 66, 67], "exact": [77, 107], "exactli": [5, 107], "exampl": [2, 3, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 36, 45, 46, 47, 49, 54, 60, 65, 66, 67, 71, 76, 77, 100, 101, 102, 105, 107], "example_batch": [24, 103, 104], "except": 77, "execut": [77, 85, 94, 101], "exhaust": 105, "exist": [0, 73, 77, 92, 107], "exp": [3, 17, 63, 77], "exp2": 77, "exp2_": 77, "exp_": 77, "expand": [77, 100], "expand_a": 77, "expansion_factor": 13, "expect": [3, 9, 15, 49, 77, 89, 90, 107], "experi": [16, 61], "experiment": [24, 60, 77, 89, 90, 92, 103], "explicit": 77, "explicitli": 92, "explod": 107, "expm1": 77, "expm1_": 77, "expon": 77, "exponent_bit": [41, 42, 43], "exponenti": 77, "exponential_": 77, "export_chrome_trac": 77, "express": [77, 92, 96], "extend": [4, 72, 100], "extra": [75, 107], "extract": [7, 49, 101], "ey": 77, "f": [0, 38, 46, 47, 49, 60, 65, 66, 67, 73, 77, 107], "f_t": [65, 66, 67], "facilit": 104, "fact": [77, 107], "factor": [6, 13, 30, 31, 32, 33, 58, 60, 67, 68, 69, 70, 71, 79, 80, 81, 92, 107], "fail": [71, 77, 107], "faint": [23, 24, 103], "fals": [2, 3, 6, 7, 9, 10, 11, 15, 16, 23, 37, 47, 48, 49, 60, 61, 65, 66, 67, 71, 73, 77, 100, 102], "fan_in": 107, "fan_out": 107, "far": 107, "fashion": 77, "faster": 60, "favor": [56, 57, 58], "fc1": 102, "fc1_bia": 102, "fc1_weight": 102, "fc2": 102, "fc2_bia": 102, "fc2_weight": 102, "featur": [6, 52, 53, 54, 60, 104, 105, 107], "fed": [102, 107], "fetch": 101, "fetch_args_kwargs_from_env": 101, "fetch_attr": 101, "fewer": [65, 66, 67], "ffn": [39, 82], "field": [3, 47, 75, 77], "fifo": 73, "fill": 77, "fill_": 77, "fill_diagonal_": 77, "fill_uninitialized_memori": 77, "fill_valu": 77, "filter": [2, 46, 77], "final": [5, 11, 53, 54, 91, 92], "find": 107, "fine": [60, 65, 66, 67, 77], "finer": 64, "finish": 77, "fire": [65, 66, 67], "first": [0, 5, 7, 38, 54, 65, 66, 77, 85, 94, 100, 101, 107], "firstli": 107, "fit": 77, "five": 92, "fix": [7, 49, 77, 107], "fix_": 77, "flag": [65, 66, 67], "flash": 60, "flash_attent": 60, "flashattent": 60, "flatten": 77, "flatter": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63], "flexibl": 107, "flip": 77, "fliplr": 77, "flipud": 77, "float": [3, 6, 7, 8, 9, 12, 15, 16, 17, 18, 19, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 36, 37, 38, 39, 41, 45, 46, 47, 48, 49, 50, 51, 52, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 77, 80, 81, 82, 84, 86, 87, 91, 99, 101, 103, 107], "float16": [2, 10, 11, 54, 60, 65, 66, 67, 77], "float32": [65, 66, 67, 77], "float64": [60, 65, 66, 67, 77], "float_pow": 77, "float_power_": 77, "floattensor": [7, 77], "floor": 77, "floor_": 77, "floor_divid": 77, "floor_divide_": 77, "fmax": 77, "fmin": 77, "fmod": 77, "fmod_": 77, "fn": [39, 57, 82, 95], "focuss": 105, "follow": [0, 3, 24, 38, 54, 60, 65, 66, 67, 73, 77, 85, 91, 95, 100, 103, 104, 107], "footprint": [65, 66, 67], "foral": 77, "forc": 77, "foreach": [65, 66, 67], "fork": [104, 107], "form": [60, 107], "format": [77, 89, 90, 104, 107], "format_to_tupl": 104, "formul": 60, "formula": [67, 100], "fortun": 107, "forward": [5, 6, 22, 23, 24, 41, 60, 77, 79, 80, 81, 84, 85, 89, 90, 91, 92, 94, 95, 99, 100, 101, 102, 103, 107], "found": [73, 104, 107], "fp16": 107, "fp32": [89, 90, 107], "fp8": [90, 107], "fpformat": [42, 43, 89, 104], "frac": [2, 3, 6, 9, 10, 11, 15, 17, 46, 60, 63, 77], "frac_": 77, "free": 105, "freed": 77, "freez": 7, "frequenc": [7, 49], "frexp": 77, "fro": 77, "from": [0, 2, 4, 6, 7, 9, 10, 11, 17, 21, 23, 24, 48, 49, 65, 66, 67, 73, 77, 85, 86, 87, 89, 90, 91, 92, 94, 100, 101, 102, 103, 107], "from_numpi": 77, "from_pretrain": 7, "fromkei": 73, "front": 77, "frozen": [65, 66, 67], "fsdp": 77, "full": [0, 21, 46, 65, 66, 67, 77, 91, 100, 107], "fulli": 101, "func": [76, 100], "function": [0, 1, 2, 3, 5, 6, 8, 10, 11, 12, 16, 17, 18, 20, 22, 23, 25, 27, 34, 40, 64, 65, 66, 67, 72, 77, 79, 82, 83, 85, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 100, 101, 102, 104, 105, 107], "functionctx": 100, "fundament": 92, "further": [65, 66, 77, 107], "furthermor": [6, 77], "fuse": [60, 65, 66, 67, 107], "fusion": 107, "futur": [24, 77, 85, 100, 103], "fwad": 100, "fwd": [84, 107], "fwd_format": 89, "fwd_scale": 107, "fwd_tensor": 84, "fweight": 77, "fx": [22, 23, 85, 86, 87, 88, 91, 94, 95, 97, 101, 102, 107], "g": [22, 23, 39, 62, 65, 66, 77, 82, 86, 87, 91, 100], "g1": 100, "g2": 100, "g_": 67, "g_t": [65, 66, 67], "gamma": [9, 15, 65, 66, 67], "gate": [16, 61, 62], "gather": 77, "gaussian": [8, 16, 50, 61], "gc": 77, "gcd": 77, "gcd_": 77, "ge": 77, "ge_": 77, "gelu": [16, 61, 92, 104, 107], "gener": [3, 21, 22, 23, 24, 41, 65, 66, 67, 76, 77, 86, 87, 92, 102, 103, 107], "generate_vmap_rul": 100, "genproto": 76, "geometr": [28, 77, 107], "geometric_": 77, "geq": [3, 47], "geqrf": 77, "ger": 77, "get": [7, 71, 73, 77, 105, 107], "get_attr": 101, "get_devic": 77, "get_info": 77, "get_swap_module_params_on_convers": 77, "git": [104, 107], "github": [77, 104, 107], "give": [2, 19, 77, 92, 104, 107], "given": [2, 3, 4, 5, 7, 18, 21, 39, 41, 43, 46, 47, 49, 60, 64, 73, 76, 77, 82, 86, 87, 88, 89, 90, 96, 100, 102, 107], "global": [60, 71, 85], "glorot": [29, 107], "gmean": [2, 8, 10, 11, 16, 17, 38, 45, 46, 50, 52, 53, 54, 61, 63, 104, 107], "gnu": 102, "go": [77, 100, 107], "good": 107, "got": 100, "gpu": 77, "gqa": 60, "grad": [32, 38, 46, 52, 65, 66, 67, 77, 80, 81, 100, 107], "grad_bias_scal": 107, "grad_fn": 77, "grad_input": 100, "grad_input_scal": [30, 32, 38, 107], "grad_out": 100, "grad_output": 100, "grad_weight_scal": 107, "gradient": [2, 3, 7, 8, 10, 11, 16, 17, 19, 30, 31, 32, 33, 38, 45, 46, 47, 49, 50, 52, 53, 54, 58, 61, 63, 65, 66, 67, 77, 84, 89, 92, 100, 107], "grain": 60, "graph": [22, 23, 24, 65, 66, 77, 86, 87, 88, 91, 92, 94, 95, 96, 97, 100, 101, 103], "graph_modul": 94, "graph_to_datafram": 104, "graphcor": [104, 107], "graphmodul": [85, 94, 101], "great": 107, "greater": [60, 77], "greater_": 77, "greater_equ": 77, "greater_equal_": 77, "greedili": 77, "ground": [3, 47], "group": [2, 26, 27, 28, 29, 46, 60, 64, 65, 66, 67, 71, 77], "grow": 107, "gt": 77, "gt_": 77, "guarante": [65, 66, 67, 77, 101, 105], "guid": [92, 104], "guidanc": 77, "gx": 100, "gy": 100, "gz": 100, "h": [7, 9, 60, 77], "h_": [10, 11], "ha": [2, 3, 6, 9, 15, 27, 46, 47, 54, 60, 65, 66, 67, 73, 77, 91, 92, 100, 106, 107], "half": [2, 60, 77], "hand": 5, "handl": [65, 66, 67, 77, 100, 101], "happen": [77, 100], "harder": 2, "hardshrink": 77, "harmon": 29, "has_nam": 77, "have": [2, 3, 7, 22, 23, 49, 52, 53, 54, 58, 65, 66, 67, 77, 86, 87, 89, 90, 91, 92, 100, 104, 105, 107], "haven": 105, "head": [12, 18, 19], "headroom": 107, "heavisid": 77, "heaviside_": 77, "height": 77, "help": [104, 105, 107], "henc": [77, 92], "here": [19, 60, 77, 92, 94, 105, 107], "hi": 107, "hidden": [12, 13, 18, 19], "hidden_s": [12, 13, 18, 19, 102], "hierarchi": 101, "high": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63, 77], "higher": [3, 60, 77], "histc": 77, "histogram": 77, "hmean": [2, 8, 10, 11, 16, 17, 38, 45, 46, 50, 52, 53, 54, 61, 63, 104], "hold": [65, 66, 67, 77], "home": 102, "hook": [65, 66, 67, 77, 100], "hop_length": 77, "horizont": [23, 24, 103], "host": 77, "how": [77, 100, 104], "howev": [2, 46, 65, 66, 67, 77, 92, 94, 107], "hq": 60, "hspace": [65, 66, 67], "hsplit": 77, "http": [19, 77, 104, 107], "huggingfac": [21, 24, 103], "hybrid": 77, "hyperparamet": [39, 82, 107], "hypot": 77, "hypot_": 77, "i": [0, 2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 21, 22, 23, 24, 27, 32, 39, 46, 47, 48, 49, 50, 52, 53, 54, 58, 60, 61, 62, 63, 65, 66, 67, 73, 74, 77, 82, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 100, 101, 103, 104, 105], "i0": 77, "i0_": 77, "icml": 104, "id": [21, 65, 66, 67], "ideal": 107, "ident": [6, 107], "identif": 92, "identifi": [60, 77, 86, 92], "identit": 77, "idx": [77, 100], "igamma": 77, "igamma_": 77, "igammac": 77, "igammac_": 77, "ignor": [3, 47, 76, 77, 107], "ignore_index": [3, 47], "ignore_w": [7, 49], "imag": [2, 3, 9, 77], "imaginari": 77, "img": 77, "immedi": 77, "impair": [65, 66, 67], "implement": [0, 6, 9, 12, 13, 15, 17, 18, 19, 24, 35, 60, 65, 66, 67, 77, 85, 92, 94, 102, 103, 104, 106, 107], "impli": 77, "implicit": [2, 46], "implicitli": 75, "import": [0, 23, 67, 85, 86, 87, 91, 92, 107], "improv": [6, 60, 65, 66, 67], "in_channel": 2, "in_dim": 100, "in_featur": [10, 11], "incept": [3, 47], "includ": [10, 11, 52, 53, 77, 105], "include_self": 77, "inclus": 77, "incom": [2, 10, 11, 52, 53], "incorrect": [19, 77, 100], "increas": [2, 13, 46, 60, 100], "incur": 107, "indend": 22, "independ": [6, 71, 77, 92], "independent_weight_decai": [65, 66, 67, 71], "index": [3, 4, 7, 39, 49, 77, 82, 100], "index_add": 77, "index_add_": [77, 100], "index_copi": 77, "index_copy_": 77, "index_fil": 77, "index_fill_": 77, "index_put": 77, "index_put_": 77, "index_reduce_": 77, "index_select": 77, "indic": [3, 7, 23, 24, 47, 49, 60, 65, 66, 67, 77, 100, 103], "indices_or_sect": 77, "individu": 77, "induc": 100, "inf": [17, 60, 77], "infer": [18, 77], "info": 100, "inform": [2, 22, 46, 60, 77, 84, 87, 91], "inherit": 0, "initi": [4, 5, 7, 9, 10, 11, 15, 39, 49, 65, 66, 67, 77, 82, 107], "initial_env": 101, "initialis": [92, 107], "inner": [77, 92], "innermost": 77, "inplac": [6, 16, 48, 61, 65, 66, 67, 100, 102], "inpt": [23, 86, 87, 91], "input": [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 22, 23, 24, 30, 31, 32, 33, 38, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 77, 80, 81, 87, 90, 91, 92, 100, 101, 102, 103, 107], "input2": 77, "input3": 77, "input_idx": 21, "input_tensor": 77, "insert": [4, 73, 89, 90], "insid": 100, "inspect": [89, 90], "instanc": [7, 9, 65, 66, 67, 77, 91], "instead": [3, 47, 65, 66, 67, 77], "instruct": [104, 107], "instrument": 101, "int": [2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 21, 24, 37, 39, 41, 42, 43, 45, 46, 47, 49, 51, 59, 63, 65, 66, 67, 74, 76, 77, 82, 84, 100, 101, 103, 107], "int16": 77, "int32": 77, "int64": [47, 77], "int8": 77, "int_repr": 77, "integ": [2, 9, 45, 77, 100], "integr": 77, "intend": [23, 65, 66, 67, 87, 91, 99, 100], "interfac": [24, 92, 94, 103], "intermedi": [13, 60, 65, 66, 67, 91, 101], "intermediari": 100, "intern": [22, 46, 89, 90, 91], "interoper": [92, 94], "interpol": [36, 77], "interpret": 101, "interv": 77, "introduc": [85, 107], "introduct": 107, "inttensor": [7, 77], "invalid": 77, "invers": [7, 49, 77], "investig": 107, "invoc": 101, "involv": 107, "ipu": 77, "is_caus": [12, 19, 60], "is_coalesc": 77, "is_complex": 77, "is_conj": 77, "is_contigu": 77, "is_cpu": 77, "is_cuda": 77, "is_floating_point": 77, "is_infer": 77, "is_ipu": 77, "is_leaf": 77, "is_meta": 77, "is_mp": 77, "is_neg": 77, "is_pin": 77, "is_quant": 77, "is_set_to": 77, "is_shar": 77, "is_sign": 77, "is_spars": 77, "is_sparse_csr": 77, "is_xla": 77, "is_xpu": 77, "isclos": 77, "isfinit": 77, "isinf": 77, "isn": 77, "isnan": 77, "isneginf": 77, "isposinf": 77, "isreal": 77, "issu": [105, 107], "issuecom": 77, "istft": 77, "item": [3, 73, 77], "items": 77, "iter": [4, 65, 66, 67, 71, 73, 77, 88, 94], "its": [2, 38, 54, 65, 66, 77, 81, 84, 85, 96, 100, 104, 107], "itself": [65, 66, 67, 77], "iw": 46, "j": [54, 77], "jit": 107, "jump": 77, "just": [18, 21, 65, 66, 67, 77, 100, 101, 107], "jvp": 100, "k": [2, 3, 10, 11, 47, 54, 73, 77], "keen": [104, 105, 107], "keep": [77, 107], "keep_type_expr": 96, "keepdim": [37, 77], "kei": [18, 60, 73, 77, 84, 91, 92, 104], "kept": 60, "kernel": [2, 46, 60, 77, 107], "kernel_s": 2, "keyerror": 73, "keyword": [24, 60, 77, 101, 103], "know": [77, 100], "known": [2, 16, 61, 105], "kron": 77, "kthvalu": 77, "kw": 46, "kwarg": [38, 65, 66, 67, 75, 77, 96, 100, 101], "l": [2, 3, 60], "l2": [65, 67], "l_": 2, "l_1": 3, "l_n": 3, "label": [3, 21, 24, 103], "label_smooth": [3, 47], "lack": [18, 73], "laid": 77, "lambd": 77, "lambda": [65, 66, 67, 77], "languag": 104, "larg": 107, "larger": [7, 49, 56, 57, 58, 77, 92, 107], "largest": 77, "last": [3, 5, 9, 10, 11, 51, 59, 73, 77, 85], "late": 77, "later": [16, 61, 77], "latest": 107, "layer": [2, 5, 9, 10, 11, 12, 13, 15, 18, 19, 39, 51, 58, 65, 66, 67, 82, 85, 92, 107], "layer_norm": [9, 104], "layernorm": [51, 104], "layout": [52, 53, 54, 77, 101], "lcm": 77, "lcm_": 77, "ldexp": 77, "ldexp_": 77, "ldot": [9, 65, 66, 67, 77], "le": 77, "le_": 77, "lead": 100, "leaf": 77, "leak": 100, "learn": [7, 9, 10, 11, 16, 61, 65, 66, 67, 71, 107], "learnabl": [2, 7, 9, 10, 11, 15], "least": [54, 77], "leav": [65, 66, 67, 77, 107], "left": [2, 31, 33, 54, 60, 77, 107], "left_grad_scal": [31, 33], "leftarrow": [65, 66, 67], "legaci": 106, "len": 77, "length": [2, 21, 24, 46, 77, 103], "lerp": 77, "lerp_": 77, "less": 77, "less_": 77, "less_equ": 77, "less_equal_": 77, "let": 77, "letter": 0, "level": [77, 107], "lfloor": 2, "lgamma": 77, "lgamma_": 77, "li": 17, "lib": 102, "librari": [34, 91, 92, 104, 105, 107], "lie": [17, 63], "life": 77, "lifetim": 77, "lifo": 73, "lift": 77, "like": [4, 5, 19, 65, 66, 67, 73, 77, 100, 107], "limit": [36, 60, 91, 104], "linalg": 77, "line": [23, 24, 103, 107], "linear": [8, 11, 16, 50, 53, 61, 62, 77, 89, 90, 102, 104, 107], "linear_1": [102, 107], "linear_1_bia": 107, "linear_1_weight": 107, "linear_2": 107, "linear_2_bia": 107, "linear_2_weight": 107, "linear_readout": 104, "linearreadout": [67, 104], "link": 2, "linux": 102, "list": [4, 5, 7, 9, 49, 65, 66, 67, 77, 94, 101, 105], "liter": [2, 10, 11, 14, 74, 77], "literatur": 90, "live": 77, "llama3": 60, "ln": 77, "load": [65, 66, 67, 77], "load_state_dict": [65, 66, 67, 77], "loaded_weight": 77, "local": [18, 102], "locat": 77, "log": [3, 77, 107], "log10": 77, "log10_": 77, "log1p": 77, "log1p_": 77, "log2": 77, "log2_": 77, "log_": 77, "log_normal_": 77, "logaddexp": 77, "logaddexp2": 77, "logarithm": 36, "logarithmic_interpol": 104, "logcumsumexp": 77, "logdet": 77, "logic": 54, "logical_and": 77, "logical_and_": 77, "logical_not": [60, 77], "logical_not_": 77, "logical_or": 77, "logical_or_": 77, "logical_xor": 77, "logical_xor_": 77, "logist": [16, 61, 62], "logit": [3, 47, 77], "logit_": 77, "logsoftmax": 3, "logsumexp": 77, "long": [3, 77], "longer": [0, 77], "longtensor": [7, 49, 77], "look": [7, 49, 54, 65, 66, 67, 77, 101, 107], "lookup": [7, 49], "loop": [65, 66, 67], "lose": 77, "loss": [3, 23, 24, 47, 65, 66, 67, 86, 87, 91, 103, 107], "loss_fn": 67, "low": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63, 77, 104, 107], "lower": [36, 46, 60, 65, 66, 67], "lr": [64, 65, 66, 67, 68, 69, 70, 71, 77], "lr_scale_for_depth": 104, "lr_scale_func": 71, "lr_scale_func_adam": 104, "lr_scale_func_sgd": [71, 104], "lt": 77, "lt_": 77, "lu": 77, "lu_data": 77, "lu_pivot": 77, "lu_solv": 77, "m": [2, 6, 8, 10, 11, 16, 17, 54, 85, 89, 90, 91, 92, 94], "m_": [65, 66], "m_0": [65, 66], "m_t": [65, 66], "made": [65, 66, 67, 100], "mai": [0, 2, 3, 24, 46, 52, 53, 54, 58, 60, 65, 66, 67, 77, 85, 87, 91, 92, 100, 103, 104, 105, 107], "main": 77, "maintain": 57, "make": [2, 23, 46, 60, 77, 92, 94, 97, 107], "make_du": 100, "manag": [60, 77], "mani": [77, 92, 100, 107], "manner": 77, "mantissa": 77, "mantissa_bit": [41, 42, 43], "manual": [3, 5, 47, 65, 66, 67, 77, 85, 92, 107], "map": [65, 66, 67, 77, 101], "map_": 77, "map_nodes_to_valu": 101, "mark": [77, 100], "mark_dirti": 100, "mark_non_differenti": 100, "mask": [12, 18, 19, 21, 60, 77], "masked_fil": 77, "masked_fill_": [60, 77], "masked_grad": 77, "masked_scatt": 77, "masked_scatter_": 77, "masked_select": 77, "mat": 77, "mat1": 77, "mat2": 77, "match": [60, 65, 66, 67, 77], "materi": 100, "math": [60, 102], "mathbb": 3, "mathcal": [2, 7, 10, 11], "mathrm": [9, 37, 65, 66], "matmul": [77, 89, 90, 104], "matplotlib": [23, 24, 103], "matric": 77, "matrix": [7, 49, 54, 60, 77], "matrix_exp": 77, "matrix_pow": 77, "matter": 100, "max": [23, 65, 66, 77, 107], "max_absolute_valu": 41, "max_norm": [7, 49], "maxim": [65, 66, 67, 104], "maximum": [23, 24, 41, 49, 77, 103], "maxnorm": 77, "mean": [3, 6, 8, 9, 10, 11, 15, 16, 17, 23, 26, 28, 29, 37, 47, 52, 53, 55, 77, 85, 107], "mean_ab": [23, 84], "meantim": [3, 47], "measur": [55, 107], "mechan": [60, 107], "median": 77, "meet": 107, "memori": [60, 65, 66, 67, 77, 100, 107], "memory_format": [77, 101], "mention": 77, "merit": 107, "met": 77, "meta": [77, 91], "metadata": [65, 66, 67, 75], "meth": 76, "method": [4, 5, 24, 65, 71, 73, 77, 85, 86, 87, 89, 90, 91, 94, 100, 101, 102, 103, 104, 107], "metric": [20, 22, 23, 91, 104], "mh": 77, "mhsa": 104, "mhsa_tau": 19, "micikeviciu": 90, "might": [65, 66, 67, 77, 107], "min": [23, 77, 107], "min_absolute_norm": 41, "min_absolute_subnorm": 41, "mind": 76, "mini": [7, 9, 49], "minibatch": [3, 46, 47], "minim": [65, 66, 67], "minimum": [23, 24, 41, 77, 103], "minlength": 77, "minu": 77, "mirror": [0, 77], "miss": [52, 53, 54, 105], "mixtur": [3, 47], "mkl": 77, "mkldnn": 77, "mlp": [19, 39, 82, 102, 104, 107], "mlp_tau": 19, "mm": [54, 77], "mod": 97, "mode": [2, 9, 46, 60, 77, 100], "model": [5, 20, 23, 24, 58, 65, 66, 67, 71, 86, 87, 89, 90, 91, 92, 98, 103, 104], "modestli": [65, 66, 67], "modif": [0, 4, 5, 77, 100], "modifi": [49, 65, 66, 67, 77, 100], "modul": [1, 2, 4, 5, 6, 7, 9, 10, 11, 15, 18, 22, 23, 24, 27, 34, 49, 54, 60, 77, 83, 85, 86, 87, 89, 90, 91, 92, 94, 95, 97, 101, 102, 103, 107], "module_load": 77, "modulelist": [4, 5], "moduletyp": 102, "moment": [65, 66], "momentum": 67, "momentum_buff": [65, 66, 67], "more": [2, 7, 46, 49, 60, 61, 63, 65, 66, 67, 77, 91, 100, 105, 107], "more_img": 77, "moreov": 77, "most": [60, 77, 100, 105, 107], "motiv": 107, "move": [73, 77, 85, 94, 106], "move_to_end": 73, "moveaxi": 77, "movedim": 77, "mp": 77, "mse_loss": 104, "mseloss": 55, "msort": 77, "mt": 77, "mtia": 77, "mu": [67, 77], "much": [77, 92, 107], "mul": 77, "mul_": 77, "mult": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63], "multi": [12, 19], "multinomi": 77, "multipl": [3, 47, 54, 77, 80, 81, 87, 107], "multipli": [2, 3, 8, 12, 16, 17, 47, 50, 54, 60, 61, 62, 63, 77, 107], "multiply_": 77, "mup": [64, 65, 66, 67, 71], "mup_scaling_depth": [14, 71, 74], "mup_typ": [14, 71, 74], "must": [2, 8, 10, 11, 16, 17, 27, 36, 38, 45, 46, 50, 52, 53, 54, 60, 61, 63, 77, 85, 86, 87, 100, 107], "mv": 77, "mvlgamma": 77, "mvlgamma_": 77, "mymodel": 60, "n": [2, 3, 7, 9, 17, 47, 54, 60, 77, 101], "n_fft": 77, "n_i": 2, "nabla_": [65, 66, 67], "naiv": 107, "name": [2, 8, 10, 11, 16, 17, 21, 24, 27, 38, 45, 46, 50, 52, 53, 54, 61, 63, 77, 101, 103], "named_img": 77, "named_tensor": 77, "nan": 77, "nan_to_num": 77, "nan_to_num_": 77, "nanmean": 77, "nanmedian": 77, "nanquantil": 77, "nansum": 77, "narrow": 77, "narrow_copi": 77, "natur": 60, "nbyte": 77, "ndarrai": 77, "ndim": 77, "ndimens": 77, "ne": 77, "ne_": 77, "nearli": 77, "necessari": [58, 77, 86, 100, 107], "necessarili": [3, 77], "need": [3, 46, 58, 77, 85, 91, 100, 102, 107], "needs_input_grad": 100, "neg": 77, "neg_": 77, "negative_": 77, "neginf": 77, "negligibli": 87, "neither": 100, "nelement": 77, "neq": [65, 67], "nest": [60, 77, 85, 92, 94], "nesterov": 67, "network": [6, 11, 16, 53, 61, 65, 66, 67], "neural": [6, 16, 61], "neuron": 6, "never": [77, 87], "nevertheless": 0, "new": [65, 66, 67, 73, 77, 85, 89, 90, 91, 94, 95, 96, 104, 105, 107], "new_arg": [65, 66, 67], "new_empti": 77, "new_empty_strid": 77, "new_ful": 77, "new_gelu": 92, "new_kwarg": [65, 66, 67], "new_on": 77, "new_tensor": 77, "new_zero": 77, "newli": 7, "next": [77, 107], "nextaft": 77, "nextafter_": 77, "nice": [2, 77], "nightli": 107, "nllloss": 3, "nlp": 9, "nn": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 22, 23, 24, 44, 49, 60, 65, 66, 67, 72, 74, 75, 77, 85, 89, 90, 92, 94, 95, 97, 100, 101, 102, 103, 107], "nnz": 77, "no_grad": [7, 65, 66, 67, 77], "node": [86, 87, 88, 91, 96, 101], "non": [3, 7, 12, 19, 40, 41, 47, 49, 54, 58, 60, 71, 77, 87, 100, 107], "non_block": 77, "non_recurse_funct": 94, "nondeterminist": [2, 46, 60, 77], "none": [2, 3, 4, 7, 8, 9, 10, 11, 14, 16, 17, 19, 23, 27, 37, 38, 45, 46, 47, 49, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 65, 66, 67, 70, 71, 73, 74, 77, 96, 97, 99, 100, 101, 102, 107], "nonlinear": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63], "nonzero": 77, "nonzero_stat": 77, "nor": 100, "norm": [2, 7, 10, 11, 14, 49, 74, 77], "norm_typ": [7, 49], "normal": [9, 15, 41, 51, 59, 77, 107], "normal_": 77, "normalis": [15, 58], "normalized_shap": [9, 15, 51, 59], "not_equ": 77, "not_equal_": 77, "note": [0, 2, 3, 4, 5, 7, 9, 10, 11, 15, 18, 46, 47, 49, 54, 60, 65, 66, 67, 77, 85, 91, 92, 94, 100, 106, 107], "notebook": [104, 107], "notic": [52, 53, 54], "noun": 90, "now": [24, 77, 91, 95, 100, 103, 107], "nse": 77, "num_channel": 77, "num_embed": 7, "num_sampl": 77, "number": [2, 3, 8, 10, 11, 12, 16, 17, 18, 19, 21, 32, 40, 41, 45, 46, 47, 49, 51, 52, 53, 59, 77, 107], "number_of_heads_kei": 60, "number_of_heads_key_valu": 60, "number_of_heads_queri": 60, "number_of_heads_valu": 60, "numel": [23, 77, 84], "numer": [9, 15, 23, 24, 49, 65, 66, 83, 89, 90, 103, 107], "numerical_accuraci": 60, "numpi": [77, 100], "object": [14, 65, 66, 67, 73, 74, 75, 77, 84, 91, 100], "observ": [3, 47, 77], "obtain": 77, "occasion": [104, 107], "occur": [65, 66, 67, 77, 107], "occurr": 77, "od": 73, "odd": 46, "offset": 77, "often": [7, 49, 77, 107], "old": [77, 107], "omit": 77, "onc": [88, 100], "once_differenti": 100, "one": [2, 8, 10, 11, 16, 17, 27, 38, 45, 46, 50, 52, 53, 54, 58, 60, 61, 63, 65, 66, 67, 77, 85, 100, 107], "ones": [7, 9, 15, 49, 60, 77], "onesid": 77, "onli": [3, 9, 24, 30, 31, 32, 33, 41, 47, 54, 60, 68, 76, 77, 80, 81, 85, 86, 89, 90, 91, 94, 100, 101, 103, 105, 107], "op": [30, 31, 32, 33, 65, 66, 67, 77, 85, 89, 90, 100, 101, 105, 107], "open": [52, 53, 54], "oper": [2, 6, 9, 15, 23, 24, 25, 46, 48, 52, 53, 54, 60, 63, 77, 79, 87, 92, 100, 102, 103, 105, 107], "opoverload": 101, "oppos": 100, "opt": 102, "optim": [3, 60, 85, 94, 95, 104], "optimis": [85, 94, 95, 104], "option": [2, 3, 7, 8, 9, 10, 11, 12, 16, 17, 18, 19, 21, 23, 24, 27, 38, 39, 45, 46, 49, 50, 52, 53, 54, 56, 57, 58, 60, 61, 62, 63, 65, 66, 67, 71, 77, 82, 84, 87, 92, 94, 96, 100, 101, 102, 103], "order": [5, 58, 60, 65, 66, 67, 73, 77, 101, 107], "ordereddict": 5, "ordin": 77, "org": 19, "orgqr": 77, "origin": [3, 16, 47, 61, 77, 81, 101], "ormqr": 77, "other": [2, 5, 20, 45, 46, 54, 60, 65, 66, 67, 77, 91, 105, 107], "otherwis": [3, 65, 66, 67, 73, 77, 100], "our": [65, 66, 92, 94, 100, 104, 105, 107], "out": [0, 2, 6, 10, 11, 45, 46, 52, 53, 54, 56, 64, 77, 100, 104, 107], "out_channel": 2, "out_dim": 100, "out_featur": [10, 11], "outer": 77, "outermost": 77, "outlin": [104, 107], "output": [2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 23, 24, 30, 31, 32, 33, 38, 45, 46, 47, 49, 50, 52, 53, 54, 60, 61, 62, 63, 74, 77, 86, 100, 101, 102, 103, 107], "output_s": 77, "output_scal": [30, 31, 32, 33, 38, 107], "outsid": 107, "over": [3, 5, 9, 15, 46, 47, 56, 57, 58, 60, 65, 66, 67, 77, 92, 100, 107], "overflow": [63, 107], "overhead": 107, "overlap": 77, "overload": 77, "overrid": [3, 47, 100], "overridden": [71, 100], "overview": [104, 107], "own": [2, 92, 94], "p": [6, 7, 14, 48, 49, 54, 60, 74, 77], "packag": 102, "pad": [2, 7, 12, 19, 46, 49, 77], "pad_mod": 77, "padding_idx": [7, 49], "padding_mod": 2, "padw": 46, "pair": [73, 77, 99], "panda": 22, "paper": [6, 9, 15, 39, 65, 66, 82, 92, 104, 107], "paradigm": 107, "parallel": 60, "param": [65, 66, 67, 68, 69, 71, 77], "param_group": [65, 66, 67, 71], "paramet": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 36, 38, 39, 45, 46, 47, 48, 49, 50, 52, 53, 54, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 71, 80, 81, 82, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 101, 102, 103, 104], "parameter": 77, "parameterdata": [14, 68, 69, 70, 71, 74, 78], "parametr": 104, "paramst": 71, "part": 60, "partial": 101, "particular": [54, 77, 92, 107], "particularli": [3, 107], "partit": 60, "pass": [5, 7, 22, 23, 24, 41, 58, 60, 65, 66, 67, 71, 76, 77, 79, 80, 81, 84, 89, 90, 91, 92, 95, 99, 100, 101, 102, 103, 107], "patch": 95, "patch_to_expand_modul": 104, "path": [21, 24, 103], "pd": 22, "pdf": 77, "peak": [65, 66, 67], "penalti": [65, 67], "pep": 76, "per": [3, 9, 15, 47, 65, 66, 67, 71, 77, 100], "perform": [2, 3, 5, 46, 60, 63, 65, 66, 67, 71, 77, 100, 107], "permut": 77, "perspect": [23, 24, 103, 107], "phi": [8, 50], "physic": 77, "pi": [8, 50, 77], "pick": 77, "pin": 77, "pin_memori": 77, "pinvers": 77, "pip": [104, 107], "pivot": 77, "pixel": 3, "place": [6, 48, 49, 77, 100, 107], "placehold": 101, "plain": 102, "plane": [2, 9, 46], "pleas": [52, 53, 54, 60, 65, 66, 77, 100, 106, 107], "plot": [22, 24, 91, 103, 104], "plot_kwarg": [24, 103], "point": [2, 3, 23, 24, 41, 49, 60, 77, 86, 87, 91, 103, 107], "pointer": 77, "poll": 77, "polygamma": 77, "polygamma_": 77, "pop": 73, "popitem": 73, "popul": [77, 101], "posinf": 77, "posit": [2, 3, 18, 77, 101], "position": 77, "possibl": [0, 49, 77, 107], "possibli": [22, 23], "post": [12, 18, 19, 65, 66, 67, 77, 106], "potenti": [2, 46, 60, 77], "pow": 77, "pow_": 77, "power": [46, 52], "practic": [23, 77, 91, 95, 107], "pre": [65, 66, 67, 101, 107], "precis": [2, 10, 11, 54, 60, 104, 107], "predict": [47, 77], "prefer": 60, "prenorm": 19, "prepend": [54, 65, 66, 67, 77], "preprocess": 77, "presenc": 76, "present": [73, 77, 96, 104, 107], "preserv": [77, 107], "preserve_format": 77, "pretrain": 7, "pretrainedtokenizerbas": [21, 24, 103], "prevent": [6, 63, 77, 100], "preview": 107, "previou": [77, 87, 92], "primarili": 76, "print": [10, 11, 102, 107], "prior": [58, 60, 100], "prioriti": 92, "probabl": [3, 6, 12, 18, 19, 47, 48, 60, 77], "problem": [3, 104, 107], "proce": 92, "procedur": 92, "process": [3, 7, 47, 65, 66, 67, 86, 92, 101], "process_input": 101, "process_output": 101, "prod": 77, "produc": [2, 77, 107], "product": [54, 60, 77], "profil": 77, "program": [77, 107], "progress": [65, 66, 67, 77], "prohibit": [65, 66, 67], "promot": 45, "promptli": 101, "proof": 92, "propag": 77, "properli": 4, "properti": [41, 77], "proportion": 77, "proto": 76, "protocol": [14, 74, 75, 78], "proven": 6, "provid": [3, 5, 24, 26, 28, 29, 30, 31, 32, 33, 60, 64, 65, 66, 67, 73, 77, 87, 91, 92, 103, 105, 107], "prune": [23, 24, 86, 87, 88, 103], "prune_non_float_tensor": [91, 104], "prune_same_scal": 23, "prune_same_scale_tensor": [91, 104], "prune_selected_nod": 104, "pruned_graph": [86, 87], "pull": 77, "purpos": [54, 77, 89, 90, 107], "push": 107, "put": [77, 107], "put_": 77, "py": 102, "python": [4, 77, 102], "python3": 102, "pytorch": [0, 20, 60, 77, 85, 92, 107], "q": 77, "q_per_channel_axi": 77, "q_per_channel_scal": 77, "q_per_channel_zero_point": 77, "q_scale": 77, "q_zero_point": 77, "qr": 77, "qscheme": 77, "qtensor": 77, "quad": 3, "qualifi": 101, "quantil": 77, "quantis": [41, 89, 90], "quantise_bwd": 41, "quantise_fwd": 41, "quantiz": 77, "queri": 60, "queu": 77, "r": [7, 49, 77, 100], "race": 77, "rad2deg": 77, "rad2deg_": 77, "rais": [27, 60, 73, 77, 100], "rand": [49, 60, 77], "randint": [47, 77], "randn": [2, 3, 6, 8, 9, 10, 11, 16, 17, 45, 46, 47, 77, 102, 107], "random": [2, 21, 46, 60, 77, 100], "random_": [3, 77], "rang": [3, 17, 24, 63, 77, 85, 89, 90, 92, 101, 103, 107], "rank": 77, "rate": [65, 66, 67, 71], "rather": 85, "ratio": [36, 39, 56, 57, 58, 77, 82, 107], "ravel": 77, "re": [24, 63, 77, 103, 104, 105, 107], "read": [77, 107], "readout_constraint": [67, 70], "real": 77, "realli": 101, "realloc": 77, "reason": [60, 91, 105], "receiv": [65, 66, 67, 107], "recent": 107, "reciproc": 77, "reciprocal_": 77, "recogn": 76, "recommend": [24, 28, 77, 103, 104, 107], "recomput": 100, "record": [77, 91, 100, 101], "record_stream": 77, "rectifi": 107, "recurs": [92, 94, 95, 97, 101, 102], "recurse_modul": 102, "reduc": [3, 47, 55, 77, 89, 90], "reduct": [3, 47, 55, 77], "reevalu": [65, 66, 67], "refer": [65, 66, 77, 104, 106, 107], "referenc": 101, "refin": 77, "refine_nam": 77, "reflect": [2, 77, 102, 106, 107], "regard": [7, 49, 65, 66, 107], "regist": [4, 5, 65, 66, 67, 77], "register_hook": 77, "register_load_state_dict_post_hook": [65, 66, 67], "register_load_state_dict_pre_hook": [65, 66, 67], "register_post_accumulate_grad_hook": 77, "register_state_dict_post_hook": [65, 66, 67], "register_state_dict_pre_hook": [65, 66, 67], "register_step_post_hook": [65, 66, 67], "register_step_pre_hook": [65, 66, 67], "regular": [4, 6, 66, 71, 92], "reinforc": [16, 61], "reinterpret": 77, "rel": [13, 19, 39, 56, 58, 77, 82, 87], "releas": [77, 85, 104, 107], "relev": [0, 9, 23, 24, 71, 103], "reli": [77, 92], "relu": [5, 102], "relu1": 5, "relu2": 5, "remain": [7, 49], "remaind": 77, "remainder_": 77, "remap": 77, "rememb": 73, "remov": [54, 65, 66, 67, 73, 77, 87, 96, 107], "removablehandl": [65, 66, 67], "removeablehandl": [65, 66, 67], "renam": 77, "rename_": 77, "rename_map": 77, "renamed_img": 77, "renorm": [7, 49, 77], "renorm_": 77, "repeat": [77, 107], "repeat_interleav": [60, 77], "replac": [0, 77, 92, 96, 107], "replace_node_with_funct": 104, "replic": 2, "repo": [104, 107], "report": 101, "repres": [23, 24, 77, 84, 88, 91, 92, 94, 96, 99, 100, 101, 102, 103, 107], "represent": [41, 77, 89, 90], "request": [52, 53, 54, 65, 66, 67, 105], "requir": [2, 3, 60, 77, 91, 92, 100, 107], "requires_grad": [3, 7, 47, 77, 100], "requires_grad_": [77, 91, 102, 107], "rescal": [3, 17, 47], "research": [104, 107], "reset": [65, 66, 67], "reshap": [23, 77, 87], "reshape_a": 77, "resid": 77, "residu": [18, 19, 39, 56, 57, 58, 82, 92, 107], "residual_add": [57, 58, 104, 107], "residual_appli": 104, "residual_attn_ratio": [39, 82], "residual_mult": [39, 82], "residual_sc": 18, "residual_split": [56, 57, 104, 107], "residuallay": 107, "resiz": 77, "resize_": 77, "resize_as_": 77, "resolve_conj": 77, "resolve_neg": 77, "respect": [49, 65, 66, 67, 77, 100], "rest": 77, "restrict": [3, 54], "result": [22, 23, 56, 58, 77, 91, 101], "retain": 96, "retain_grad": 77, "retain_graph": 77, "retains_grad": 77, "rethink": [3, 47], "retriev": [7, 27, 49, 77, 100, 101], "return": [3, 5, 14, 17, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 36, 38, 39, 43, 47, 54, 56, 58, 60, 62, 63, 65, 66, 67, 71, 73, 74, 76, 77, 80, 81, 82, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 100, 101, 102, 103, 107], "return_complex": 77, "return_count": 77, "return_invers": 77, "reus": 77, "revers": 77, "rfloor": 2, "right": [2, 31, 33, 54], "right_grad_scal": [31, 33], "rm": [15, 59, 104], "rms_norm": 104, "rmsnorm": [59, 104], "rocm": [2, 10, 11, 54], "roll": 77, "root": 15, "rot90": 77, "roughli": 107, "round": [41, 77, 89, 90], "round_": 77, "rounding_mod": 77, "routin": 77, "row": [23, 24, 49, 77, 103], "row_indic": 77, "rsqrt": 77, "rsqrt_": 77, "rtol": [77, 87], "rule": [39, 60, 64, 65, 66, 67, 77, 82], "run": [5, 22, 23, 60, 65, 66, 67, 77, 89, 90, 101, 104, 107], "run_nod": 101, "runtim": 76, "runtime_check": 76, "runtimeerror": [77, 100], "s0": 77, "s1": 77, "safe": [65, 66, 77], "safeti": 77, "sai": 77, "said": 92, "sake": [4, 5, 83, 91], "same": [2, 3, 5, 6, 8, 9, 10, 11, 16, 17, 39, 46, 47, 54, 60, 62, 77, 82, 87, 89, 90, 100, 107], "sampl": [2, 3, 6, 7, 10, 11, 21, 47, 48, 49, 60, 77], "satisfi": 77, "save": [65, 66, 67, 77, 100], "save_for_backward": 100, "save_for_forward": 100, "saved_tensor": 100, "saved_tensors_hook": 100, "saved_weight": 77, "scalar": [3, 9, 54, 77, 80, 81, 99, 107], "scale": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 77, 82, 83, 85, 86, 87, 89, 90, 91, 92, 98, 102, 103, 105], "scale_bwd": 104, "scale_channel": 77, "scale_elementwis": 104, "scale_factor": 60, "scale_fwd": 104, "scale_grad_by_freq": [7, 49], "scale_pow": [46, 52], "scaled_dot_product_attent": [89, 90, 104], "scaled_paramet": [64, 104], "scaledmlp": 107, "scalepair": [100, 104], "scales_graph": [22, 23, 24, 86, 87, 91, 103], "scaletrack": 104, "scaletrackinginterpret": 104, "scatter": 77, "scatter_": 77, "scatter_add": 77, "scatter_add_": 77, "scatter_reduc": 77, "scatter_reduce_": 77, "scenario": 107, "scheme": [18, 39, 77, 82], "scope": 107, "score": 60, "scratch": 107, "script": 107, "sdpa_kernel": 60, "sdpbackend": 60, "second": [5, 7, 54, 65, 66, 100], "secondli": 107, "section": [47, 77, 104, 107], "see": [2, 3, 7, 8, 10, 11, 16, 17, 18, 19, 38, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 76, 77, 92, 100, 101, 104, 105, 106, 107], "seed": 21, "seen": [23, 24, 77, 103, 107], "select": [2, 30, 31, 32, 33, 46, 60, 77], "select_scatt": 77, "self": [4, 12, 16, 19, 60, 61, 65, 66, 67, 76, 77, 101, 102, 107], "semant": [77, 101], "sens": 107, "sentence_length": 9, "separ": 107, "seq": 77, "seq_len": [21, 24, 103], "sequenc": [2, 5, 12, 19, 21, 24, 51, 77, 103], "sequenti": [5, 18], "set": [2, 3, 6, 9, 10, 11, 15, 39, 46, 47, 48, 60, 65, 66, 67, 71, 73, 77, 82, 84, 88, 91, 92, 100, 107], "set_": 77, "set_materialize_grad": 100, "set_to_non": [65, 66, 67], "setdefault": 73, "setup_context": 100, "sever": 46, "sgd": [64, 70, 77, 104], "sgn": 77, "sgn_": 77, "shallow": [65, 66, 67, 73], "shape": [2, 3, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 45, 46, 47, 49, 50, 52, 53, 60, 61, 62, 63, 77, 100], "share": [77, 100, 107], "share_memory_": 77, "sharper": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63], "shift": [21, 77], "short": 77, "shorthand": 77, "should": [3, 17, 38, 46, 47, 56, 58, 60, 65, 66, 67, 75, 77, 89, 90, 91, 92, 94, 95, 100, 101, 102, 107], "show": [23, 104, 107], "show_arrow": 23, "show_error_bar": 23, "show_zero_tensor": 23, "shown": [9, 23], "shrink": 107, "shuffl": 21, "shuffle_buffer_s": 21, "side": [2, 46, 77], "sigma": [16, 61, 62, 77], "sigmoid": [16, 61, 62, 77], "sigmoid_": 77, "sign": [77, 107], "sign_": 77, "signal": [2, 46], "signatur": [65, 66, 67, 76, 77, 100], "signbit": 77, "signific": [87, 107], "significantli": [65, 66, 67, 107], "silu": [62, 104], "silu_glu": 104, "similar": [77, 100, 107], "simpl": [76, 77], "simplefunc": 100, "simplest": 2, "simpli": [6, 107], "simul": [40, 77, 89, 90], "simulate_format": 104, "simulate_fp8": [85, 91, 92, 104, 105], "sin": 77, "sin_": 77, "sinc": [65, 66, 67, 77], "sinc_": 77, "singl": [2, 3, 5, 9, 46, 57, 65, 66, 67, 77, 85], "singleton": [9, 77], "sinh": 77, "sinh_": 77, "site": 102, "situat": 77, "size": [2, 3, 7, 9, 10, 11, 12, 13, 18, 19, 21, 24, 47, 49, 60, 77, 100, 103], "size_averag": [3, 47, 55], "sizedim": 77, "sizeof": [65, 66, 67], "skip": [19, 56, 57, 58, 65, 66, 67, 100, 107], "slice": [17, 63, 77, 87, 101], "slice_scatt": 77, "slide": 104, "slightli": [85, 87], "slogdet": 77, "small": [5, 107], "smaller": [77, 107], "smm": 77, "smooth": [3, 47], "so": [2, 17, 46, 60, 63, 65, 66, 67, 77, 85, 91, 92, 94, 100, 101, 102, 107], "softmax": [3, 12, 18, 19, 47, 60, 77, 104, 107], "solv": 107, "some": [0, 2, 3, 46, 47, 52, 53, 54, 60, 65, 66, 67, 77, 92, 104, 107], "some_comm_op": 77, "someth": [65, 66, 67, 107], "sort": [77, 100], "sound": 5, "sourc": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 36, 37, 38, 39, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 78, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 99, 100, 101, 102, 103], "sp": 104, "space": [2, 36, 46], "span": [3, 77], "spars": [7, 17, 49, 52, 53, 54, 77], "sparse_bsc": 77, "sparse_bsr": 77, "sparse_coo": 77, "sparse_coo_tensor": 77, "sparse_csc": 77, "sparse_csr": 77, "sparse_dim": 77, "sparse_mask": 77, "sparse_resize_": 77, "sparse_resize_and_clear_": 77, "sparsecsr": 77, "sparsedim": 77, "spatial": 9, "special": [77, 92, 105], "specif": [9, 60, 65, 66, 67, 77, 89, 90, 101], "specifi": [3, 7, 47, 49, 60, 63, 65, 66, 67, 73, 77, 87, 91, 100, 107], "speed": 107, "speedup": [89, 90, 107], "split": [46, 58, 77], "split_siz": 77, "split_size_or_sect": 77, "sqrt": [2, 8, 9, 10, 11, 15, 37, 50, 60, 65, 66, 77], "sqrt_": 77, "squar": [15, 55, 60, 65, 66, 77], "square_": 77, "squeez": 77, "squeeze_": 77, "srbit": 41, "src": 77, "sspaddmm": 77, "stabil": [9, 15, 65, 66], "stack": [39, 82, 107], "stage": 92, "stai": [77, 107], "standard": [0, 9, 17, 24, 40, 77, 85, 90, 92, 94, 99, 100, 101, 102, 103, 104, 107], "star": 2, "start": [39, 77, 82, 101, 107], "start_dim": 77, "state": [65, 66, 67, 77], "state_dict": [65, 66, 67], "static": [76, 100], "static_s": 77, "staticmethod": 100, "statist": 9, "std": [23, 77, 84], "steadili": 107, "step": [65, 66, 67, 71, 77, 107], "stft": 77, "still": [58, 77, 85, 92, 100, 105], "stochast": [41, 65, 89, 90], "storag": [77, 100], "storage_offset": 77, "storage_typ": 77, "store": [5, 7, 77, 101], "str": [2, 3, 8, 10, 11, 16, 17, 21, 23, 24, 27, 38, 41, 45, 46, 47, 50, 52, 53, 54, 55, 61, 63, 65, 66, 67, 70, 71, 77, 88, 101, 102, 103], "stream": 77, "strictli": 77, "stride": [2, 46, 77], "string": [2, 46, 77, 102], "strongli": 107, "structur": [76, 100, 101], "sub": 77, "sub_": 77, "subclass": [77, 100], "subject": [60, 77], "submodul": 5, "subnorm": 41, "subsequ": [2, 5], "subspac": 77, "substanti": 107, "substitut": 92, "subtract": 77, "subtract_": 77, "subtyp": 76, "success": 77, "successfulli": 85, "suffici": [77, 92, 107], "suitabl": 77, "sum": [3, 15, 17, 47, 63, 77], "sum_": [2, 3], "sum_j": [17, 63], "sum_to_s": 77, "super": [60, 102, 107], "suppli": [2, 86, 87, 89, 92, 94, 100, 107], "support": [0, 2, 3, 6, 7, 10, 11, 14, 24, 45, 46, 47, 48, 49, 52, 53, 54, 60, 65, 66, 67, 74, 75, 77, 78, 85, 92, 94, 100, 103], "sure": 60, "suspect": 105, "svd": 77, "sw": 46, "swap": [0, 77, 107], "swap_tensor": 77, "swapax": 77, "swapaxes_": 77, "swapdim": 77, "swapdims_": 77, "swiglu": 13, "swish": [16, 61], "switch": [65, 66, 67], "symbool": 101, "symfloat": 101, "symint": 101, "sync": 77, "synchron": 77, "syntax_highlight": 102, "system": [92, 94], "t": [2, 7, 23, 24, 43, 46, 49, 52, 53, 65, 66, 67, 76, 77, 85, 86, 87, 92, 94, 95, 97, 100, 103, 105, 107], "t_": 77, "tabl": [7, 49], "tag": [71, 75], "take": [38, 60, 77, 92, 107], "take_along_dim": 77, "taken": [3, 107], "tall": 77, "tan": 77, "tan_": 77, "tanh": [8, 50, 77], "tanh_": 77, "target": [3, 47, 55, 67, 77, 88, 96, 101, 107], "target_fn": 96, "target_to_funct": 101, "tau": [39, 56, 57, 58, 67, 82, 107], "techniqu": [6, 107], "tell": 77, "temp_mask": 60, "temperatur": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63], "templat": [24, 103], "tend": [91, 107], "tensor": [2, 3, 7, 10, 11, 14, 15, 17, 21, 23, 24, 37, 38, 41, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 71, 74, 80, 81, 84, 86, 87, 91, 94, 99, 100, 101, 102, 103, 107], "tensor1": 77, "tensor2": 77, "tensor_split": 77, "tensorfloat32": [2, 10, 11, 46, 52, 53, 54], "tensorlist": [65, 66, 67], "term": [65, 66, 77, 107], "test": [92, 105, 107], "text": [2, 3, 7, 8, 9, 10, 11, 16, 17, 21, 45, 46, 47, 50, 61, 62, 63, 65, 66, 67, 77], "textbf": [65, 66, 67], "textit": [65, 66, 67], "texttt": 77, "th": 77, "than": [2, 7, 46, 49, 58, 60, 65, 66, 67, 77, 85, 91, 92, 101], "thei": [0, 5, 23, 24, 65, 66, 67, 76, 77, 92, 100, 103, 104, 107], "them": [7, 17, 63, 77, 92, 100, 101, 105], "themselv": 107, "theori": [77, 107], "thereaft": [85, 94], "therefor": [7, 49, 77], "theta": [65, 66, 67], "theta_": [65, 66, 67], "theta_0": [65, 66, 67], "theta_t": [65, 66, 67], "thi": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 21, 22, 23, 24, 27, 38, 39, 41, 45, 46, 48, 49, 50, 52, 53, 54, 58, 60, 61, 63, 65, 66, 67, 71, 73, 75, 77, 82, 85, 86, 87, 89, 90, 91, 92, 94, 95, 97, 100, 101, 103, 104, 105, 106, 107], "thin": 0, "think": 77, "thorough": 107, "those": [0, 3, 47, 77, 89, 90, 104, 107], "though": [54, 58, 77, 85, 94, 105, 107], "three": [9, 31, 33, 60], "through": [65, 66, 67, 77, 100, 101, 105, 107], "throw": 77, "thrown": [60, 77], "thu": [54, 107], "tile": 77, "time": [2, 9, 45, 54, 65, 66, 67, 77, 107], "titl": 23, "to_dens": 77, "to_grad_input_scal": [2, 8, 10, 11, 16, 17, 38, 46, 50, 52, 53, 61, 63, 104], "to_left_grad_scal": [45, 54, 104], "to_mkldnn": 77, "to_output_scal": [2, 8, 10, 11, 16, 17, 38, 45, 46, 50, 52, 53, 54, 61, 63, 104], "to_padded_tensor": 77, "to_right_grad_scal": [45, 54, 104], "to_spars": 77, "to_sparse_bsc": 77, "to_sparse_bsr": 77, "to_sparse_coo": 77, "to_sparse_csc": 77, "to_sparse_csr": 77, "togeth": [56, 107], "toggl": 102, "token": [18, 21, 24, 103], "told": 92, "toler": 87, "tolist": 77, "too": [3, 60, 77, 89, 90], "tool": [20, 107], "top": [3, 92], "topk": 77, "torch": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 22, 23, 24, 44, 45, 46, 47, 49, 54, 60, 63, 65, 66, 67, 69, 70, 71, 72, 74, 75, 77, 83, 85, 89, 90, 91, 92, 94, 95, 97, 100, 102, 103, 105, 107], "torch_doctest_autograd": 100, "torch_nn_modules_to_user_modul": [95, 104], "torchdynamo": [89, 90, 92, 94, 95, 97], "torchscript": [77, 107], "total": [39, 77, 82], "touch": 105, "trace": [77, 92], "tracer": 101, "track": [4, 5, 23, 24, 77, 86, 91, 103], "track_scal": [22, 23, 24, 86, 87, 103, 104], "tracked_model": [24, 103], "tradeoff": 77, "trail": 15, "train": [3, 6, 7, 9, 48, 49, 58, 60, 65, 66, 67, 104, 105, 107], "trainabl": [65, 66, 67], "transform": [5, 9, 10, 11, 18, 19, 22, 23, 24, 38, 39, 52, 53, 64, 65, 66, 67, 77, 82, 103, 104, 105, 107], "transformer_residual_scaling_rul": [18, 104], "transformerdecod": 104, "transformerlay": 104, "transpos": [60, 77], "transpose_": 77, "treat": [5, 9, 17, 77], "tri": [65, 66, 67, 77], "trial": 77, "triangular": 60, "triangular_solv": 77, "trick": 107, "tricki": 107, "trigger": 77, "tril": [60, 77], "tril_": 77, "trip": 107, "triu": 77, "triu_": 77, "trivial_subclass": 97, "trou": 2, "true": [2, 3, 6, 7, 9, 10, 11, 15, 23, 24, 46, 47, 48, 49, 60, 65, 66, 67, 71, 73, 77, 89, 90, 96, 100, 101, 102, 103], "true_divid": 77, "true_divide_": 77, "trunc": 77, "trunc_": 77, "truncat": 77, "trunk": 18, "truth": [3, 47], "try": [2, 46, 60, 65, 66, 67, 107], "tune": [65, 66, 67, 107], "tupl": [2, 15, 21, 27, 37, 42, 43, 46, 52, 58, 59, 65, 66, 67, 77, 96, 100, 101, 102, 107], "tuple_to_format": 104, "tutori": [100, 107], "twice": 77, "two": [2, 3, 30, 47, 54, 58, 60, 65, 66, 67, 77, 85, 100, 107], "type": [2, 7, 10, 11, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 36, 38, 39, 45, 46, 49, 56, 58, 60, 62, 63, 65, 66, 67, 71, 76, 77, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 101, 102, 103], "type_a": 77, "typedstorag": 77, "typeguard": 78, "typic": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63, 77, 95, 97, 107], "u": [0, 2, 10, 11, 14, 39, 64, 65, 66, 67, 71, 72, 74, 75, 82, 92, 102, 104, 106, 107], "uint8": 77, "uint8_t": 77, "ultim": 92, "unaffect": 77, "unbalanc": 3, "unbatch": 3, "unbias": 9, "unbind": 77, "unchang": [77, 80], "unclear": 77, "uncoalesc": 77, "unconstrain": 92, "undefin": [77, 100], "under": 49, "underflow": 107, "underli": [60, 77, 85], "underneath": 100, "underscor": 77, "understand": 107, "undesir": [2, 46, 60], "unexpectedli": 77, "unflatten": 77, "unfold": 77, "ungraph": [65, 66], "uniform": [3, 47, 77], "uniform_": 77, "uniniti": 77, "union": 102, "uniqu": 77, "unique_consecut": 77, "unit": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 25, 28, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 71, 83, 85, 89, 90, 91, 92, 98, 102, 105], "unit_sc": [0, 104, 105, 107], "unit_scal": [85, 91, 104, 105], "unit_scaled_funct": 107, "unitriangular": 77, "unitscaledmodul": 107, "unless": [77, 100], "unlik": [9, 23, 24, 77, 103, 107], "unment": 77, "unnam": 77, "unnorm": [3, 47], "unreduc": 3, "unsafe_chunk": 77, "unsafe_split": 77, "unscal": 107, "unscaledmlp": 107, "unspecifi": [17, 65, 66, 67, 77], "unsqueez": 77, "unsqueeze_": 77, "until": [77, 107], "untyped_storag": 77, "untypedstorag": 77, "up": [7, 49, 77, 100, 101, 107], "updat": [7, 49, 71, 73, 77, 104, 107], "upgrad": 107, "upper": [36, 60, 77], "us": [0, 2, 3, 5, 7, 8, 9, 10, 11, 13, 19, 21, 22, 24, 25, 27, 29, 41, 46, 48, 49, 54, 56, 57, 58, 60, 63, 65, 66, 67, 76, 77, 83, 84, 85, 87, 89, 90, 91, 92, 94, 95, 97, 100, 101, 102, 103, 104, 105, 107], "usag": [18, 34, 77], "use_deterministic_algorithm": 77, "user": [24, 60, 65, 66, 67, 77, 89, 90, 91, 92, 94, 97, 100, 103, 104], "usual": [65, 66, 67, 77, 107], "util": [65, 66, 67, 77, 104, 107], "uu": [0, 14, 71, 74, 107], "v": [49, 65, 66, 67, 73, 77], "v1": [21, 24, 103], "v_": [65, 66], "v_0": [65, 66], "v_t": [65, 66], "valid": [2, 46, 54, 77, 92, 107], "valu": [2, 3, 5, 7, 9, 10, 11, 15, 17, 23, 24, 36, 41, 46, 47, 56, 57, 58, 60, 65, 66, 67, 71, 73, 77, 86, 89, 90, 92, 100, 101, 103, 107], "valueerror": 27, "var": [9, 77], "variabl": [77, 100], "varianc": [39, 82], "variant": [65, 66], "variou": [89, 91], "vdot": 77, "ve": 105, "vec": 77, "vec1": 77, "vec2": 77, "vector": [7, 49, 54, 77], "veri": 107, "verif": [65, 66, 67], "versa": 77, "version": [1, 24, 38, 44, 54, 65, 66, 67, 77, 85, 91, 92, 94, 95, 97, 103, 107], "versu": 60, "via": [9, 77, 86, 87, 91, 92, 101, 107], "vice": 77, "video": [77, 104], "view": [23, 73, 77], "view_a": 77, "visibl": 4, "vision": [3, 47], "visual": 2, "visualis": [21, 23, 91, 104], "vjp": 100, "vmap": 100, "vocab_s": 18, "vocabulari": 18, "vsplit": 77, "w": [3, 7, 9, 49, 77, 100], "w_": 3, "w_c": 3, "wa": [16, 61, 77, 101], "wai": [5, 77, 85, 92, 100, 107], "wait": 77, "wait_stream": 77, "want": [77, 105], "warn": [19, 24, 60, 103], "we": [2, 24, 65, 66, 67, 77, 87, 90, 92, 94, 100, 101, 103, 104, 105, 107], "weight": [2, 3, 7, 9, 10, 11, 14, 15, 16, 18, 19, 36, 46, 47, 49, 51, 52, 53, 56, 57, 58, 59, 60, 61, 65, 66, 67, 71, 74, 77, 89, 92, 102, 107], "weight_decai": [65, 66, 67, 71], "weight_mup_typ": [2, 10, 11], "welcom": 104, "well": [3, 65, 66, 67, 77, 105, 107], "were": [77, 100], "weren": 100, "what": [2, 5, 60, 65, 66, 67, 86, 100, 104], "whatev": [60, 77], "when": [2, 3, 5, 8, 9, 10, 11, 15, 17, 46, 47, 50, 54, 60, 65, 66, 67, 77, 89, 90, 91, 95, 97, 101, 107], "where": [2, 3, 7, 8, 9, 10, 11, 16, 17, 47, 49, 50, 52, 53, 54, 60, 61, 62, 65, 66, 67, 77, 89, 91, 92, 101, 107], "wherea": 77, "whether": [65, 66, 67, 77, 92, 100], "which": [0, 3, 5, 9, 13, 17, 24, 58, 60, 63, 65, 66, 67, 71, 77, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 100, 101, 103, 107], "while": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63, 77, 100], "who": [104, 105, 107], "whole": 5, "whose": [77, 102], "why": [60, 107], "wide": [24, 92, 103], "widehat": [65, 66], "width": 77, "wikitext": [21, 24, 103], "win_length": 77, "window": 77, "wise": [38, 50, 55], "wish": [94, 104, 107], "within": [20, 77, 85, 94, 101, 107], "without": [65, 66, 67, 77, 85, 102, 105, 107], "won": 77, "word": [2, 7, 49], "work": [24, 60, 77, 85, 93, 103, 104, 105, 107], "would": [77, 100], "wouldn": 77, "wrap": [77, 95, 101, 102, 107], "wrapper": [0, 64, 92, 94], "write": 77, "written": 85, "wrong": 100, "wrt": 77, "x": [3, 8, 9, 15, 16, 23, 37, 41, 50, 61, 62, 76, 77, 92, 100, 102, 107], "x86_64": 102, "x_": [3, 17, 63], "x_i": [17, 63], "x_j": [17, 63], "x_npy": 100, "x_t": 100, "xa": [52, 53], "xavier": [29, 107], "xdoctest": [7, 49, 67, 100], "xla": 77, "xlogi": 77, "xlogy_": 77, "xmax": 23, "xmin": 23, "xpu": 77, "y": [3, 9, 15, 52, 53, 77, 100], "y_": 3, "y_n": 3, "y_t": 100, "yet": [65, 66, 77, 92, 104, 107], "you": [2, 3, 46, 52, 53, 54, 60, 65, 66, 67, 77, 100, 101, 105, 107], "your": [77, 100, 107], "z": [77, 100], "zero": [2, 6, 7, 9, 14, 48, 49, 60, 65, 66, 67, 74, 77, 92, 100], "zero_": [49, 77], "zero_grad": [65, 66, 67], "zero_point": 77, "zeros_lik": 100, "zip": [65, 66, 67], "\u00b5p": 104, "\u00e0": 2, "\u03bcp": [14, 39, 72, 74, 75, 82, 104, 106]}, "titles": ["3. API reference", "3.1. unit_scaling", "3.1.4. unit_scaling.Conv1d", "3.1.5. unit_scaling.CrossEntropyLoss", "3.1.6. unit_scaling.DepthModuleList", "3.1.7. unit_scaling.DepthSequential", "3.1.8. unit_scaling.Dropout", "3.1.9. unit_scaling.Embedding", "3.1.10. unit_scaling.GELU", "3.1.11. unit_scaling.LayerNorm", "3.1.12. unit_scaling.Linear", "3.1.13. unit_scaling.LinearReadout", "3.1.14. unit_scaling.MHSA", "3.1.15. unit_scaling.MLP", "3.1.1. unit_scaling.Parameter", "3.1.16. unit_scaling.RMSNorm", "3.1.17. unit_scaling.SiLU", "3.1.18. unit_scaling.Softmax", "3.1.19. unit_scaling.TransformerDecoder", "3.1.20. unit_scaling.TransformerLayer", "3.2. unit_scaling.analysis", "3.2.1. unit_scaling.analysis.example_batch", "3.2.2. unit_scaling.analysis.graph_to_dataframe", "3.2.3. unit_scaling.analysis.plot", "3.2.4. unit_scaling.analysis.visualiser", "3.3. unit_scaling.constraints", "3.3.1. unit_scaling.constraints.amean", "3.3.2. unit_scaling.constraints.apply_constraint", "3.3.3. unit_scaling.constraints.gmean", "3.3.4. unit_scaling.constraints.hmean", "3.3.5. unit_scaling.constraints.to_grad_input_scale", "3.3.6. unit_scaling.constraints.to_left_grad_scale", "3.3.7. unit_scaling.constraints.to_output_scale", "3.3.8. unit_scaling.constraints.to_right_grad_scale", "3.1.21. unit_scaling.core", "3.1.21.1. unit_scaling.core.functional", "3.1.21.1.1. unit_scaling.core.functional.logarithmic_interpolation", "3.1.21.1.2. unit_scaling.core.functional.rms", "3.1.21.1.3. unit_scaling.core.functional.scale_elementwise", "3.1.21.1.4. unit_scaling.core.functional.transformer_residual_scaling_rule", "3.4. unit_scaling.formats", "3.4.3. unit_scaling.formats.FPFormat", "3.4.1. unit_scaling.formats.format_to_tuple", "3.4.2. unit_scaling.formats.tuple_to_format", "3.1.22. unit_scaling.functional", "3.1.22.1. unit_scaling.functional.add", "3.1.22.2. unit_scaling.functional.conv1d", "3.1.22.3. unit_scaling.functional.cross_entropy", "3.1.22.4. unit_scaling.functional.dropout", "3.1.22.5. unit_scaling.functional.embedding", "3.1.22.6. unit_scaling.functional.gelu", "3.1.22.7. unit_scaling.functional.layer_norm", "3.1.22.8. unit_scaling.functional.linear", "3.1.22.9. unit_scaling.functional.linear_readout", "3.1.22.10. unit_scaling.functional.matmul", "3.1.22.11. unit_scaling.functional.mse_loss", "3.1.22.12. unit_scaling.functional.residual_add", "3.1.22.13. unit_scaling.functional.residual_apply", "3.1.22.14. unit_scaling.functional.residual_split", "3.1.22.15. unit_scaling.functional.rms_norm", "3.1.22.16. unit_scaling.functional.scaled_dot_product_attention", "3.1.22.17. unit_scaling.functional.silu", "3.1.22.18. unit_scaling.functional.silu_glu", "3.1.22.19. unit_scaling.functional.softmax", "3.1.23. unit_scaling.optim", "3.1.23.5. unit_scaling.optim.Adam", "3.1.23.6. unit_scaling.optim.AdamW", "3.1.23.7. unit_scaling.optim.SGD", "3.1.23.1. unit_scaling.optim.lr_scale_for_depth", "3.1.23.2. unit_scaling.optim.lr_scale_func_adam", "3.1.23.3. unit_scaling.optim.lr_scale_func_sgd", "3.1.23.4. unit_scaling.optim.scaled_parameters", "3.1.24. unit_scaling.parameter", "3.1.24.3. unit_scaling.parameter.OrderedDict", "3.1.24.1. unit_scaling.parameter.Parameter", "3.1.24.4. unit_scaling.parameter.ParameterData", "3.1.24.5. unit_scaling.parameter.Protocol", "3.1.24.6. unit_scaling.parameter.Tensor", "3.1.24.2. unit_scaling.parameter.has_parameter_data", "3.5. unit_scaling.scale", "3.5.1. unit_scaling.scale.scale_bwd", "3.5.2. unit_scaling.scale.scale_fwd", "3.1.2. unit_scaling.transformer_residual_scaling_rule", "3.6. unit_scaling.transforms", "3.6.9. unit_scaling.transforms.Metrics", "3.6.1. unit_scaling.transforms.compile", "3.6.2. unit_scaling.transforms.prune_non_float_tensors", "3.6.3. unit_scaling.transforms.prune_same_scale_tensors", "3.6.4. unit_scaling.transforms.prune_selected_nodes", "3.6.5. unit_scaling.transforms.simulate_format", "3.6.6. unit_scaling.transforms.simulate_fp8", "3.6.7. unit_scaling.transforms.track_scales", "3.6.8. unit_scaling.transforms.unit_scale", "3.7. unit_scaling.transforms.utils", "3.7.1. unit_scaling.transforms.utils.apply_transform", "3.7.2. unit_scaling.transforms.utils.patch_to_expand_modules", "3.7.3. unit_scaling.transforms.utils.replace_node_with_function", "3.7.4. unit_scaling.transforms.utils.torch_nn_modules_to_user_modules", "3.8. unit_scaling.utils", "3.8.2. unit_scaling.utils.ScalePair", "3.8.3. unit_scaling.utils.ScaleTracker", "3.8.4. unit_scaling.utils.ScaleTrackingInterpreter", "3.8.1. unit_scaling.utils.analyse_module", "3.1.3. unit_scaling.visualiser", "Unit Scaling", "2. Limitations", "Almost-scaled dot-product attention", "1. User guide"], "titleterms": {"adam": 65, "adamw": 66, "add": 45, "almost": 106, "amean": 26, "analyse_modul": 102, "analysi": [20, 21, 22, 23, 24], "api": 0, "apply_constraint": 27, "apply_transform": 94, "attent": 106, "compil": 85, "consider": 107, "constraint": [25, 26, 27, 28, 29, 30, 31, 32, 33], "content": 104, "conv1d": [2, 46], "core": [34, 35, 36, 37, 38, 39], "cross_entropi": 47, "crossentropyloss": 3, "depthmodulelist": 4, "depthsequenti": 5, "develop": 104, "dot": 106, "dropout": [6, 48], "embed": [7, 49], "example_batch": 21, "format": [40, 41, 42, 43], "format_to_tupl": 42, "fpformat": 41, "function": [35, 36, 37, 38, 39, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63], "gelu": [8, 50], "get": 104, "gmean": 28, "graph_to_datafram": 22, "guid": 107, "has_parameter_data": 78, "hmean": 29, "how": 107, "i": 107, "instal": [104, 107], "kei": 107, "layer_norm": 51, "layernorm": 9, "limit": 105, "linear": [10, 52], "linear_readout": 53, "linearreadout": 11, "logarithmic_interpol": 36, "lr_scale_for_depth": 68, "lr_scale_func_adam": 69, "lr_scale_func_sgd": 70, "matmul": 54, "metric": 84, "mhsa": 12, "mlp": 13, "model": 107, "mse_loss": 55, "optim": [64, 65, 66, 67, 68, 69, 70, 71], "optimis": 107, "ordereddict": 73, "paramet": [14, 72, 73, 74, 75, 76, 77, 78], "parameterdata": 75, "patch_to_expand_modul": 95, "plot": 23, "product": 106, "protocol": 76, "prune_non_float_tensor": 86, "prune_same_scale_tensor": 87, "prune_selected_nod": 88, "refer": 0, "replace_node_with_funct": 96, "residual_add": 56, "residual_appli": 57, "residual_split": 58, "rm": 37, "rms_norm": 59, "rmsnorm": 15, "scale": [79, 80, 81, 104, 106, 107], "scale_bwd": 80, "scale_elementwis": 38, "scale_fwd": 81, "scaled_dot_product_attent": 60, "scaled_paramet": 71, "scalepair": 99, "scaletrack": 100, "scaletrackinginterpret": 101, "sgd": 67, "silu": [16, 61], "silu_glu": 62, "simulate_format": 89, "simulate_fp8": 90, "softmax": [17, 63], "start": 104, "tensor": 77, "to_grad_input_scal": 30, "to_left_grad_scal": 31, "to_output_scal": 32, "to_right_grad_scal": 33, "torch_nn_modules_to_user_modul": 97, "track_scal": 91, "transform": [83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97], "transformer_residual_scaling_rul": [39, 82], "transformerdecod": 18, "transformerlay": 19, "tuple_to_format": 43, "unit": [104, 107], "unit_sc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103], "unit_scal": 92, "user": 107, "util": [93, 94, 95, 96, 97, 98, 99, 100, 101, 102], "visualis": [24, 103], "what": 107}}) \ No newline at end of file +Search.setIndex({"alltitles": {"API reference": [[0, null]], "Almost-scaled dot-product attention": [[106, null]], "Contents": [[104, null]], "Development": [[104, "development"]], "Getting Started": [[104, "getting-started"]], "How to unit-scale a model": [[107, "how-to-unit-scale-a-model"]], "Installation": [[104, "installation"], [107, "installation"]], "Key considerations for unit scaling": [[107, "key-considerations-for-unit-scaling"]], "Limitations": [[105, null]], "Optimising unit-scaled models": [[107, "optimising-unit-scaled-models"]], "Unit Scaling": [[104, null]], "User guide": [[107, null]], "What is unit scaling?": [[107, "what-is-unit-scaling"]], "unit_scaling": [[1, null]], "unit_scaling.Conv1d": [[2, null]], "unit_scaling.CrossEntropyLoss": [[3, null]], "unit_scaling.DepthModuleList": [[4, null]], "unit_scaling.DepthSequential": [[5, null]], "unit_scaling.Dropout": [[6, null]], "unit_scaling.Embedding": [[7, null]], "unit_scaling.GELU": [[8, null]], "unit_scaling.LayerNorm": [[9, null]], "unit_scaling.Linear": [[10, null]], "unit_scaling.LinearReadout": [[11, null]], "unit_scaling.MHSA": [[12, null]], "unit_scaling.MLP": [[13, null]], "unit_scaling.Parameter": [[14, null]], "unit_scaling.RMSNorm": [[15, null]], "unit_scaling.SiLU": [[16, null]], "unit_scaling.Softmax": [[17, null]], "unit_scaling.TransformerDecoder": [[18, null]], "unit_scaling.TransformerLayer": [[19, null]], "unit_scaling.analysis": [[20, null]], "unit_scaling.analysis.example_batch": [[21, null]], "unit_scaling.analysis.graph_to_dataframe": [[22, null]], "unit_scaling.analysis.plot": [[23, null]], "unit_scaling.analysis.visualiser": [[24, null]], "unit_scaling.constraints": [[25, null]], "unit_scaling.constraints.amean": [[26, null]], "unit_scaling.constraints.apply_constraint": [[27, null]], "unit_scaling.constraints.gmean": [[28, null]], "unit_scaling.constraints.hmean": [[29, null]], "unit_scaling.constraints.to_grad_input_scale": [[30, null]], "unit_scaling.constraints.to_left_grad_scale": [[31, null]], "unit_scaling.constraints.to_output_scale": [[32, null]], "unit_scaling.constraints.to_right_grad_scale": [[33, null]], "unit_scaling.core": [[34, null]], "unit_scaling.core.functional": [[35, null]], "unit_scaling.core.functional.logarithmic_interpolation": [[36, null]], "unit_scaling.core.functional.rms": [[37, null]], "unit_scaling.core.functional.scale_elementwise": [[38, null]], "unit_scaling.core.functional.transformer_residual_scaling_rule": [[39, null]], "unit_scaling.formats": [[40, null]], "unit_scaling.formats.FPFormat": [[41, null]], "unit_scaling.formats.format_to_tuple": [[42, null]], "unit_scaling.formats.tuple_to_format": [[43, null]], "unit_scaling.functional": [[44, null]], "unit_scaling.functional.add": [[45, null]], "unit_scaling.functional.conv1d": [[46, null]], "unit_scaling.functional.cross_entropy": [[47, null]], "unit_scaling.functional.dropout": [[48, null]], "unit_scaling.functional.embedding": [[49, null]], "unit_scaling.functional.gelu": [[50, null]], "unit_scaling.functional.layer_norm": [[51, null]], "unit_scaling.functional.linear": [[52, null]], "unit_scaling.functional.linear_readout": [[53, null]], "unit_scaling.functional.matmul": [[54, null]], "unit_scaling.functional.mse_loss": [[55, null]], "unit_scaling.functional.residual_add": [[56, null]], "unit_scaling.functional.residual_apply": [[57, null]], "unit_scaling.functional.residual_split": [[58, null]], "unit_scaling.functional.rms_norm": [[59, null]], "unit_scaling.functional.scaled_dot_product_attention": [[60, null]], "unit_scaling.functional.silu": [[61, null]], "unit_scaling.functional.silu_glu": [[62, null]], "unit_scaling.functional.softmax": [[63, null]], "unit_scaling.optim": [[64, null]], "unit_scaling.optim.Adam": [[65, null]], "unit_scaling.optim.AdamW": [[66, null]], "unit_scaling.optim.SGD": [[67, null]], "unit_scaling.optim.lr_scale_for_depth": [[68, null]], "unit_scaling.optim.lr_scale_func_adam": [[69, null]], "unit_scaling.optim.lr_scale_func_sgd": [[70, null]], "unit_scaling.optim.scaled_parameters": [[71, null]], "unit_scaling.parameter": [[72, null]], "unit_scaling.parameter.OrderedDict": [[73, null]], "unit_scaling.parameter.Parameter": [[74, null]], "unit_scaling.parameter.ParameterData": [[75, null]], "unit_scaling.parameter.Protocol": [[76, null]], "unit_scaling.parameter.Tensor": [[77, null]], "unit_scaling.parameter.has_parameter_data": [[78, null]], "unit_scaling.scale": [[79, null]], "unit_scaling.scale.scale_bwd": [[80, null]], "unit_scaling.scale.scale_fwd": [[81, null]], "unit_scaling.transformer_residual_scaling_rule": [[82, null]], "unit_scaling.transforms": [[83, null]], "unit_scaling.transforms.Metrics": [[84, null]], "unit_scaling.transforms.compile": [[85, null]], "unit_scaling.transforms.prune_non_float_tensors": [[86, null]], "unit_scaling.transforms.prune_same_scale_tensors": [[87, null]], "unit_scaling.transforms.prune_selected_nodes": [[88, null]], "unit_scaling.transforms.simulate_format": [[89, null]], "unit_scaling.transforms.simulate_fp8": [[90, null]], "unit_scaling.transforms.track_scales": [[91, null]], "unit_scaling.transforms.unit_scale": [[92, null]], "unit_scaling.transforms.utils": [[93, null]], "unit_scaling.transforms.utils.apply_transform": [[94, null]], "unit_scaling.transforms.utils.patch_to_expand_modules": [[95, null]], "unit_scaling.transforms.utils.replace_node_with_function": [[96, null]], "unit_scaling.transforms.utils.torch_nn_modules_to_user_modules": [[97, null]], "unit_scaling.utils": [[98, null]], "unit_scaling.utils.ScalePair": [[99, null]], "unit_scaling.utils.ScaleTracker": [[100, null]], "unit_scaling.utils.ScaleTrackingInterpreter": [[101, null]], "unit_scaling.utils.analyse_module": [[102, null]], "unit_scaling.visualiser": [[103, null]]}, "docnames": ["api_reference", "generated/unit_scaling", "generated/unit_scaling.Conv1d", "generated/unit_scaling.CrossEntropyLoss", "generated/unit_scaling.DepthModuleList", "generated/unit_scaling.DepthSequential", "generated/unit_scaling.Dropout", "generated/unit_scaling.Embedding", "generated/unit_scaling.GELU", "generated/unit_scaling.LayerNorm", "generated/unit_scaling.Linear", "generated/unit_scaling.LinearReadout", "generated/unit_scaling.MHSA", "generated/unit_scaling.MLP", "generated/unit_scaling.Parameter", "generated/unit_scaling.RMSNorm", "generated/unit_scaling.SiLU", "generated/unit_scaling.Softmax", "generated/unit_scaling.TransformerDecoder", "generated/unit_scaling.TransformerLayer", "generated/unit_scaling.analysis", "generated/unit_scaling.analysis.example_batch", "generated/unit_scaling.analysis.graph_to_dataframe", "generated/unit_scaling.analysis.plot", "generated/unit_scaling.analysis.visualiser", "generated/unit_scaling.constraints", "generated/unit_scaling.constraints.amean", "generated/unit_scaling.constraints.apply_constraint", "generated/unit_scaling.constraints.gmean", "generated/unit_scaling.constraints.hmean", "generated/unit_scaling.constraints.to_grad_input_scale", "generated/unit_scaling.constraints.to_left_grad_scale", "generated/unit_scaling.constraints.to_output_scale", "generated/unit_scaling.constraints.to_right_grad_scale", "generated/unit_scaling.core", "generated/unit_scaling.core.functional", "generated/unit_scaling.core.functional.logarithmic_interpolation", "generated/unit_scaling.core.functional.rms", "generated/unit_scaling.core.functional.scale_elementwise", "generated/unit_scaling.core.functional.transformer_residual_scaling_rule", "generated/unit_scaling.formats", "generated/unit_scaling.formats.FPFormat", "generated/unit_scaling.formats.format_to_tuple", "generated/unit_scaling.formats.tuple_to_format", "generated/unit_scaling.functional", "generated/unit_scaling.functional.add", "generated/unit_scaling.functional.conv1d", "generated/unit_scaling.functional.cross_entropy", "generated/unit_scaling.functional.dropout", "generated/unit_scaling.functional.embedding", "generated/unit_scaling.functional.gelu", "generated/unit_scaling.functional.layer_norm", "generated/unit_scaling.functional.linear", "generated/unit_scaling.functional.linear_readout", "generated/unit_scaling.functional.matmul", "generated/unit_scaling.functional.mse_loss", "generated/unit_scaling.functional.residual_add", "generated/unit_scaling.functional.residual_apply", "generated/unit_scaling.functional.residual_split", "generated/unit_scaling.functional.rms_norm", "generated/unit_scaling.functional.scaled_dot_product_attention", "generated/unit_scaling.functional.silu", "generated/unit_scaling.functional.silu_glu", "generated/unit_scaling.functional.softmax", "generated/unit_scaling.optim", "generated/unit_scaling.optim.Adam", "generated/unit_scaling.optim.AdamW", "generated/unit_scaling.optim.SGD", "generated/unit_scaling.optim.lr_scale_for_depth", "generated/unit_scaling.optim.lr_scale_func_adam", "generated/unit_scaling.optim.lr_scale_func_sgd", "generated/unit_scaling.optim.scaled_parameters", "generated/unit_scaling.parameter", "generated/unit_scaling.parameter.OrderedDict", "generated/unit_scaling.parameter.Parameter", "generated/unit_scaling.parameter.ParameterData", "generated/unit_scaling.parameter.Protocol", "generated/unit_scaling.parameter.Tensor", "generated/unit_scaling.parameter.has_parameter_data", "generated/unit_scaling.scale", "generated/unit_scaling.scale.scale_bwd", "generated/unit_scaling.scale.scale_fwd", "generated/unit_scaling.transformer_residual_scaling_rule", "generated/unit_scaling.transforms", "generated/unit_scaling.transforms.Metrics", "generated/unit_scaling.transforms.compile", "generated/unit_scaling.transforms.prune_non_float_tensors", "generated/unit_scaling.transforms.prune_same_scale_tensors", "generated/unit_scaling.transforms.prune_selected_nodes", "generated/unit_scaling.transforms.simulate_format", "generated/unit_scaling.transforms.simulate_fp8", "generated/unit_scaling.transforms.track_scales", "generated/unit_scaling.transforms.unit_scale", "generated/unit_scaling.transforms.utils", "generated/unit_scaling.transforms.utils.apply_transform", "generated/unit_scaling.transforms.utils.patch_to_expand_modules", "generated/unit_scaling.transforms.utils.replace_node_with_function", "generated/unit_scaling.transforms.utils.torch_nn_modules_to_user_modules", "generated/unit_scaling.utils", "generated/unit_scaling.utils.ScalePair", "generated/unit_scaling.utils.ScaleTracker", "generated/unit_scaling.utils.ScaleTrackingInterpreter", "generated/unit_scaling.utils.analyse_module", "generated/unit_scaling.visualiser", "index", "limitations", "posts/almost_scaled_dot_product_attention", "user_guide"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["api_reference.rst", "generated/unit_scaling.rst", "generated/unit_scaling.Conv1d.rst", "generated/unit_scaling.CrossEntropyLoss.rst", "generated/unit_scaling.DepthModuleList.rst", "generated/unit_scaling.DepthSequential.rst", "generated/unit_scaling.Dropout.rst", "generated/unit_scaling.Embedding.rst", "generated/unit_scaling.GELU.rst", "generated/unit_scaling.LayerNorm.rst", "generated/unit_scaling.Linear.rst", "generated/unit_scaling.LinearReadout.rst", "generated/unit_scaling.MHSA.rst", "generated/unit_scaling.MLP.rst", "generated/unit_scaling.Parameter.rst", "generated/unit_scaling.RMSNorm.rst", "generated/unit_scaling.SiLU.rst", "generated/unit_scaling.Softmax.rst", "generated/unit_scaling.TransformerDecoder.rst", "generated/unit_scaling.TransformerLayer.rst", "generated/unit_scaling.analysis.rst", "generated/unit_scaling.analysis.example_batch.rst", "generated/unit_scaling.analysis.graph_to_dataframe.rst", "generated/unit_scaling.analysis.plot.rst", "generated/unit_scaling.analysis.visualiser.rst", "generated/unit_scaling.constraints.rst", "generated/unit_scaling.constraints.amean.rst", "generated/unit_scaling.constraints.apply_constraint.rst", "generated/unit_scaling.constraints.gmean.rst", "generated/unit_scaling.constraints.hmean.rst", "generated/unit_scaling.constraints.to_grad_input_scale.rst", "generated/unit_scaling.constraints.to_left_grad_scale.rst", "generated/unit_scaling.constraints.to_output_scale.rst", "generated/unit_scaling.constraints.to_right_grad_scale.rst", "generated/unit_scaling.core.rst", "generated/unit_scaling.core.functional.rst", "generated/unit_scaling.core.functional.logarithmic_interpolation.rst", "generated/unit_scaling.core.functional.rms.rst", "generated/unit_scaling.core.functional.scale_elementwise.rst", "generated/unit_scaling.core.functional.transformer_residual_scaling_rule.rst", "generated/unit_scaling.formats.rst", "generated/unit_scaling.formats.FPFormat.rst", "generated/unit_scaling.formats.format_to_tuple.rst", "generated/unit_scaling.formats.tuple_to_format.rst", "generated/unit_scaling.functional.rst", "generated/unit_scaling.functional.add.rst", "generated/unit_scaling.functional.conv1d.rst", "generated/unit_scaling.functional.cross_entropy.rst", "generated/unit_scaling.functional.dropout.rst", "generated/unit_scaling.functional.embedding.rst", "generated/unit_scaling.functional.gelu.rst", "generated/unit_scaling.functional.layer_norm.rst", "generated/unit_scaling.functional.linear.rst", "generated/unit_scaling.functional.linear_readout.rst", "generated/unit_scaling.functional.matmul.rst", "generated/unit_scaling.functional.mse_loss.rst", "generated/unit_scaling.functional.residual_add.rst", "generated/unit_scaling.functional.residual_apply.rst", "generated/unit_scaling.functional.residual_split.rst", "generated/unit_scaling.functional.rms_norm.rst", "generated/unit_scaling.functional.scaled_dot_product_attention.rst", "generated/unit_scaling.functional.silu.rst", "generated/unit_scaling.functional.silu_glu.rst", "generated/unit_scaling.functional.softmax.rst", "generated/unit_scaling.optim.rst", "generated/unit_scaling.optim.Adam.rst", "generated/unit_scaling.optim.AdamW.rst", "generated/unit_scaling.optim.SGD.rst", "generated/unit_scaling.optim.lr_scale_for_depth.rst", "generated/unit_scaling.optim.lr_scale_func_adam.rst", "generated/unit_scaling.optim.lr_scale_func_sgd.rst", "generated/unit_scaling.optim.scaled_parameters.rst", "generated/unit_scaling.parameter.rst", "generated/unit_scaling.parameter.OrderedDict.rst", "generated/unit_scaling.parameter.Parameter.rst", "generated/unit_scaling.parameter.ParameterData.rst", "generated/unit_scaling.parameter.Protocol.rst", "generated/unit_scaling.parameter.Tensor.rst", "generated/unit_scaling.parameter.has_parameter_data.rst", "generated/unit_scaling.scale.rst", "generated/unit_scaling.scale.scale_bwd.rst", "generated/unit_scaling.scale.scale_fwd.rst", "generated/unit_scaling.transformer_residual_scaling_rule.rst", "generated/unit_scaling.transforms.rst", "generated/unit_scaling.transforms.Metrics.rst", "generated/unit_scaling.transforms.compile.rst", "generated/unit_scaling.transforms.prune_non_float_tensors.rst", "generated/unit_scaling.transforms.prune_same_scale_tensors.rst", "generated/unit_scaling.transforms.prune_selected_nodes.rst", "generated/unit_scaling.transforms.simulate_format.rst", "generated/unit_scaling.transforms.simulate_fp8.rst", "generated/unit_scaling.transforms.track_scales.rst", "generated/unit_scaling.transforms.unit_scale.rst", "generated/unit_scaling.transforms.utils.rst", "generated/unit_scaling.transforms.utils.apply_transform.rst", "generated/unit_scaling.transforms.utils.patch_to_expand_modules.rst", "generated/unit_scaling.transforms.utils.replace_node_with_function.rst", "generated/unit_scaling.transforms.utils.torch_nn_modules_to_user_modules.rst", "generated/unit_scaling.utils.rst", "generated/unit_scaling.utils.ScalePair.rst", "generated/unit_scaling.utils.ScaleTracker.rst", "generated/unit_scaling.utils.ScaleTrackingInterpreter.rst", "generated/unit_scaling.utils.analyse_module.rst", "generated/unit_scaling.visualiser.rst", "index.rst", "limitations.rst", "posts/almost_scaled_dot_product_attention.md", "user_guide.rst"], "indexentries": {"abs() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.abs", false]], "abs_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.abs_", false]], "absolute() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.absolute", false]], "absolute_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.absolute_", false]], "acos() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.acos", false]], "acos_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.acos_", false]], "acosh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.acosh", false]], "acosh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.acosh_", false]], "adam (class in unit_scaling.optim)": [[65, "unit_scaling.optim.Adam", false]], "adamw (class in unit_scaling.optim)": [[66, "unit_scaling.optim.AdamW", false]], "add() (in module unit_scaling.functional)": [[45, "unit_scaling.functional.add", false]], "add() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.add", false]], "add_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.add_", false]], "add_param_group() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.add_param_group", false]], "add_param_group() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.add_param_group", false]], "add_param_group() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.add_param_group", false]], "addbmm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addbmm", false]], "addbmm_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addbmm_", false]], "addcdiv() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addcdiv", false]], "addcdiv_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addcdiv_", false]], "addcmul() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addcmul", false]], "addcmul_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addcmul_", false]], "addmm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addmm", false]], "addmm_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addmm_", false]], "addmv() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addmv", false]], "addmv_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addmv_", false]], "addr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addr", false]], "addr_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.addr_", false]], "adjoint() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.adjoint", false]], "align_as() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.align_as", false]], "align_to() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.align_to", false]], "all() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.all", false]], "allclose() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.allclose", false]], "amax() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.amax", false]], "amean() (in module unit_scaling.constraints)": [[26, "unit_scaling.constraints.amean", false]], "amin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.amin", false]], "aminmax() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.aminmax", false]], "analyse_module() (in module unit_scaling.utils)": [[102, "unit_scaling.utils.analyse_module", false]], "angle() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.angle", false]], "any() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.any", false]], "append() (unit_scaling.depthmodulelist method)": [[4, "unit_scaling.DepthModuleList.append", false]], "append() (unit_scaling.depthsequential method)": [[5, "unit_scaling.DepthSequential.append", false]], "append() (unit_scaling.transformerdecoder method)": [[18, "unit_scaling.TransformerDecoder.append", false]], "apply_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.apply_", false]], "apply_constraint() (in module unit_scaling.constraints)": [[27, "unit_scaling.constraints.apply_constraint", false]], "apply_transform() (in module unit_scaling.transforms.utils)": [[94, "unit_scaling.transforms.utils.apply_transform", false]], "arccos() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arccos", false]], "arccos_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arccos_", false]], "arccosh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arccosh", false]], "arccosh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arccosh_", false]], "arcsin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arcsin", false]], "arcsin_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arcsin_", false]], "arcsinh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arcsinh", false]], "arcsinh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arcsinh_", false]], "arctan() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arctan", false]], "arctan2() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arctan2", false]], "arctan2_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arctan2_", false]], "arctan_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arctan_", false]], "arctanh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arctanh", false]], "arctanh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.arctanh_", false]], "argmax() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.argmax", false]], "argmin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.argmin", false]], "argsort() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.argsort", false]], "argwhere() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.argwhere", false]], "as_strided() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.as_strided", false]], "as_strided_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.as_strided_", false]], "as_strided_scatter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.as_strided_scatter", false]], "as_subclass() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.as_subclass", false]], "asin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.asin", false]], "asin_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.asin_", false]], "asinh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.asinh", false]], "asinh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.asinh_", false]], "atan() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.atan", false]], "atan2() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.atan2", false]], "atan2_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.atan2_", false]], "atan_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.atan_", false]], "atanh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.atanh", false]], "atanh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.atanh_", false]], "backward() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.backward", false]], "backward() (unit_scaling.utils.scaletracker static method)": [[100, "unit_scaling.utils.ScaleTracker.backward", false]], "baddbmm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.baddbmm", false]], "baddbmm_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.baddbmm_", false]], "bernoulli() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bernoulli", false]], "bernoulli_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bernoulli_", false]], "bfloat16() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bfloat16", false]], "bias (unit_scaling.conv1d attribute)": [[2, "unit_scaling.Conv1d.bias", false]], "bias (unit_scaling.layernorm attribute)": [[9, "unit_scaling.LayerNorm.bias", false]], "bias (unit_scaling.linear attribute)": [[10, "unit_scaling.Linear.bias", false]], "bias (unit_scaling.linearreadout attribute)": [[11, "unit_scaling.LinearReadout.bias", false]], "bincount() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bincount", false]], "bits (unit_scaling.formats.fpformat property)": [[41, "unit_scaling.formats.FPFormat.bits", false]], "bitwise_and() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_and", false]], "bitwise_and_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_and_", false]], "bitwise_left_shift() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_left_shift", false]], "bitwise_left_shift_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_left_shift_", false]], "bitwise_not() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_not", false]], "bitwise_not_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_not_", false]], "bitwise_or() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_or", false]], "bitwise_or_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_or_", false]], "bitwise_right_shift() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_right_shift", false]], "bitwise_right_shift_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_right_shift_", false]], "bitwise_xor() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_xor", false]], "bitwise_xor_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bitwise_xor_", false]], "bmm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bmm", false]], "bool() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.bool", false]], "boxed_run() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.boxed_run", false]], "broadcast_to() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.broadcast_to", false]], "byte() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.byte", false]], "call_function() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.call_function", false]], "call_method() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.call_method", false]], "call_module() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.call_module", false]], "cauchy_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cauchy_", false]], "cdouble() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cdouble", false]], "ceil() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ceil", false]], "ceil_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ceil_", false]], "cfloat() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cfloat", false]], "chalf() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.chalf", false]], "char() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.char", false]], "cholesky() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cholesky", false]], "cholesky_inverse() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cholesky_inverse", false]], "cholesky_solve() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cholesky_solve", false]], "chunk() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.chunk", false]], "clamp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.clamp", false]], "clamp_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.clamp_", false]], "clear() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.clear", false]], "clip() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.clip", false]], "clip_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.clip_", false]], "clone() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.clone", false]], "coalesce() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.coalesce", false]], "col_indices() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.col_indices", false]], "compile() (in module unit_scaling.transforms)": [[85, "unit_scaling.transforms.compile", false]], "conj() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.conj", false]], "conj_physical() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.conj_physical", false]], "conj_physical_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.conj_physical_", false]], "contiguous() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.contiguous", false]], "conv1d (class in unit_scaling)": [[2, "unit_scaling.Conv1d", false]], "conv1d() (in module unit_scaling.functional)": [[46, "unit_scaling.functional.conv1d", false]], "copy() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.copy", false]], "copy_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.copy_", false]], "copysign() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.copysign", false]], "copysign_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.copysign_", false]], "corrcoef() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.corrcoef", false]], "cos() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cos", false]], "cos_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cos_", false]], "cosh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cosh", false]], "cosh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cosh_", false]], "count_nonzero() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.count_nonzero", false]], "cov() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cov", false]], "cpu() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cpu", false]], "cross() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cross", false]], "cross_entropy() (in module unit_scaling.functional)": [[47, "unit_scaling.functional.cross_entropy", false]], "crossentropyloss (class in unit_scaling)": [[3, "unit_scaling.CrossEntropyLoss", false]], "crow_indices() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.crow_indices", false]], "cuda() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cuda", false]], "cummax() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cummax", false]], "cummin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cummin", false]], "cumprod() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cumprod", false]], "cumprod_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cumprod_", false]], "cumsum() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cumsum", false]], "cumsum_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.cumsum_", false]], "data_ptr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.data_ptr", false]], "deg2rad() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.deg2rad", false]], "deg2rad_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.deg2rad_", false]], "dense_dim() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dense_dim", false]], "depthmodulelist (class in unit_scaling)": [[4, "unit_scaling.DepthModuleList", false]], "depthsequential (class in unit_scaling)": [[5, "unit_scaling.DepthSequential", false]], "dequantize() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dequantize", false]], "det() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.det", false]], "detach() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.detach", false]], "detach_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.detach_", false]], "device (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.device", false]], "diag() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.diag", false]], "diag_embed() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.diag_embed", false]], "diagflat() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.diagflat", false]], "diagonal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.diagonal", false]], "diagonal_scatter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.diagonal_scatter", false]], "diff() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.diff", false]], "digamma() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.digamma", false]], "digamma_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.digamma_", false]], "dim() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dim", false]], "dim_order() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dim_order", false]], "dist() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dist", false]], "div() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.div", false]], "div_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.div_", false]], "divide() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.divide", false]], "divide_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.divide_", false]], "dot() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dot", false]], "double() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.double", false]], "dropout (class in unit_scaling)": [[6, "unit_scaling.Dropout", false]], "dropout() (in module unit_scaling.functional)": [[48, "unit_scaling.functional.dropout", false]], "dsplit() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.dsplit", false]], "element_size() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.element_size", false]], "embedding (class in unit_scaling)": [[7, "unit_scaling.Embedding", false]], "embedding() (in module unit_scaling.functional)": [[49, "unit_scaling.functional.embedding", false]], "eq() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.eq", false]], "eq_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.eq_", false]], "equal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.equal", false]], "erf() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.erf", false]], "erf_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.erf_", false]], "erfc() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.erfc", false]], "erfc_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.erfc_", false]], "erfinv() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.erfinv", false]], "erfinv_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.erfinv_", false]], "example_batch() (in module unit_scaling.analysis)": [[21, "unit_scaling.analysis.example_batch", false]], "exp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.exp", false]], "exp2() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.exp2", false]], "exp2_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.exp2_", false]], "exp_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.exp_", false]], "expand() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.expand", false]], "expand_as() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.expand_as", false]], "expm1() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.expm1", false]], "expm1_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.expm1_", false]], "exponential_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.exponential_", false]], "extend() (unit_scaling.depthmodulelist method)": [[4, "unit_scaling.DepthModuleList.extend", false]], "fetch_args_kwargs_from_env() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.fetch_args_kwargs_from_env", false]], "fetch_attr() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.fetch_attr", false]], "fill_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fill_", false]], "fill_diagonal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fill_diagonal_", false]], "fix() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fix", false]], "fix_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fix_", false]], "flatten() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.flatten", false]], "flip() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.flip", false]], "fliplr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fliplr", false]], "flipud() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.flipud", false]], "float() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.float", false]], "float_power() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.float_power", false]], "float_power_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.float_power_", false]], "floor() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.floor", false]], "floor_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.floor_", false]], "floor_divide() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.floor_divide", false]], "floor_divide_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.floor_divide_", false]], "fmax() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fmax", false]], "fmin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fmin", false]], "fmod() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fmod", false]], "fmod_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.fmod_", false]], "format_to_tuple() (in module unit_scaling.formats)": [[42, "unit_scaling.formats.format_to_tuple", false]], "fpformat (class in unit_scaling.formats)": [[41, "unit_scaling.formats.FPFormat", false]], "frac() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.frac", false]], "frac_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.frac_", false]], "frexp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.frexp", false]], "from_pretrained() (unit_scaling.embedding class method)": [[7, "unit_scaling.Embedding.from_pretrained", false]], "fromkeys() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.fromkeys", false]], "gather() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.gather", false]], "gcd() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.gcd", false]], "gcd_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.gcd_", false]], "ge() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ge", false]], "ge_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ge_", false]], "gelu (class in unit_scaling)": [[8, "unit_scaling.GELU", false]], "gelu() (in module unit_scaling.functional)": [[50, "unit_scaling.functional.gelu", false]], "geometric_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.geometric_", false]], "geqrf() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.geqrf", false]], "ger() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ger", false]], "get() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.get", false]], "get_attr() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.get_attr", false]], "get_device() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.get_device", false]], "gmean() (in module unit_scaling.constraints)": [[28, "unit_scaling.constraints.gmean", false]], "grad (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.grad", false]], "graph_to_dataframe() (in module unit_scaling.analysis)": [[22, "unit_scaling.analysis.graph_to_dataframe", false]], "greater() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.greater", false]], "greater_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.greater_", false]], "greater_equal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.greater_equal", false]], "greater_equal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.greater_equal_", false]], "gt() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.gt", false]], "gt_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.gt_", false]], "h (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.H", false]], "half() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.half", false]], "hardshrink() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.hardshrink", false]], "has_names() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.has_names", false]], "has_parameter_data() (in module unit_scaling.parameter)": [[78, "unit_scaling.parameter.has_parameter_data", false]], "heaviside() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.heaviside", false]], "heaviside_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.heaviside_", false]], "histc() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.histc", false]], "histogram() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.histogram", false]], "hmean() (in module unit_scaling.constraints)": [[29, "unit_scaling.constraints.hmean", false]], "hsplit() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.hsplit", false]], "hypot() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.hypot", false]], "hypot_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.hypot_", false]], "i0() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.i0", false]], "i0_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.i0_", false]], "igamma() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.igamma", false]], "igamma_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.igamma_", false]], "igammac() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.igammac", false]], "igammac_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.igammac_", false]], "imag (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.imag", false]], "index_add() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_add", false]], "index_add_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_add_", false]], "index_copy() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_copy", false]], "index_copy_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_copy_", false]], "index_fill() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_fill", false]], "index_fill_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_fill_", false]], "index_put() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_put", false]], "index_put_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_put_", false]], "index_reduce_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_reduce_", false]], "index_select() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.index_select", false]], "indices() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.indices", false]], "inner() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.inner", false]], "insert() (unit_scaling.depthmodulelist method)": [[4, "unit_scaling.DepthModuleList.insert", false]], "int() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.int", false]], "int_repr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.int_repr", false]], "inverse() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.inverse", false]], "ipu() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ipu", false]], "is_coalesced() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_coalesced", false]], "is_complex() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_complex", false]], "is_conj() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_conj", false]], "is_contiguous() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_contiguous", false]], "is_cpu (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_cpu", false]], "is_cuda (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_cuda", false]], "is_floating_point() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_floating_point", false]], "is_inference() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_inference", false]], "is_ipu (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_ipu", false]], "is_leaf (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_leaf", false]], "is_meta (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_meta", false]], "is_mps (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_mps", false]], "is_neg() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_neg", false]], "is_pinned() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_pinned", false]], "is_quantized (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_quantized", false]], "is_set_to() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_set_to", false]], "is_shared() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_shared", false]], "is_signed() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.is_signed", false]], "is_sparse (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_sparse", false]], "is_sparse_csr (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_sparse_csr", false]], "is_xla (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_xla", false]], "is_xpu (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.is_xpu", false]], "isclose() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isclose", false]], "isfinite() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isfinite", false]], "isinf() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isinf", false]], "isnan() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isnan", false]], "isneginf() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isneginf", false]], "isposinf() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isposinf", false]], "isreal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.isreal", false]], "istft() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.istft", false]], "item() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.item", false]], "items() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.items", false]], "itemsize (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.itemsize", false]], "jvp() (unit_scaling.utils.scaletracker static method)": [[100, "unit_scaling.utils.ScaleTracker.jvp", false]], "keys() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.keys", false]], "kron() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.kron", false]], "kthvalue() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.kthvalue", false]], "layer_norm() (in module unit_scaling.functional)": [[51, "unit_scaling.functional.layer_norm", false]], "layernorm (class in unit_scaling)": [[9, "unit_scaling.LayerNorm", false]], "lcm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lcm", false]], "lcm_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lcm_", false]], "ldexp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ldexp", false]], "ldexp_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ldexp_", false]], "le() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.le", false]], "le_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.le_", false]], "lerp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lerp", false]], "lerp_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lerp_", false]], "less() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.less", false]], "less_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.less_", false]], "less_equal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.less_equal", false]], "less_equal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.less_equal_", false]], "lgamma() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lgamma", false]], "lgamma_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lgamma_", false]], "linear (class in unit_scaling)": [[10, "unit_scaling.Linear", false]], "linear() (in module unit_scaling.functional)": [[52, "unit_scaling.functional.linear", false]], "linear_readout() (in module unit_scaling.functional)": [[53, "unit_scaling.functional.linear_readout", false]], "linearreadout (class in unit_scaling)": [[11, "unit_scaling.LinearReadout", false]], "load_state_dict() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.load_state_dict", false]], "load_state_dict() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.load_state_dict", false]], "load_state_dict() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.load_state_dict", false]], "log() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log", false]], "log10() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log10", false]], "log10_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log10_", false]], "log1p() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log1p", false]], "log1p_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log1p_", false]], "log2() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log2", false]], "log2_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log2_", false]], "log_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log_", false]], "log_normal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.log_normal_", false]], "logaddexp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logaddexp", false]], "logaddexp2() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logaddexp2", false]], "logarithmic_interpolation() (in module unit_scaling.core.functional)": [[36, "unit_scaling.core.functional.logarithmic_interpolation", false]], "logcumsumexp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logcumsumexp", false]], "logdet() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logdet", false]], "logical_and() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_and", false]], "logical_and_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_and_", false]], "logical_not() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_not", false]], "logical_not_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_not_", false]], "logical_or() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_or", false]], "logical_or_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_or_", false]], "logical_xor() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_xor", false]], "logical_xor_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logical_xor_", false]], "logit() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logit", false]], "logit_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logit_", false]], "logsumexp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.logsumexp", false]], "long() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.long", false]], "lr_scale_for_depth() (in module unit_scaling.optim)": [[68, "unit_scaling.optim.lr_scale_for_depth", false]], "lr_scale_func_adam() (in module unit_scaling.optim)": [[69, "unit_scaling.optim.lr_scale_func_adam", false]], "lr_scale_func_sgd() (in module unit_scaling.optim)": [[70, "unit_scaling.optim.lr_scale_func_sgd", false]], "lt() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lt", false]], "lt_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lt_", false]], "lu() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lu", false]], "lu_solve() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.lu_solve", false]], "map_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.map_", false]], "map_nodes_to_values() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.map_nodes_to_values", false]], "mark_dirty() (unit_scaling.utils.scaletracker method)": [[100, "unit_scaling.utils.ScaleTracker.mark_dirty", false]], "mark_non_differentiable() (unit_scaling.utils.scaletracker method)": [[100, "unit_scaling.utils.ScaleTracker.mark_non_differentiable", false]], "masked_fill() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.masked_fill", false]], "masked_fill_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.masked_fill_", false]], "masked_scatter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.masked_scatter", false]], "masked_scatter_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.masked_scatter_", false]], "masked_select() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.masked_select", false]], "matmul() (in module unit_scaling.functional)": [[54, "unit_scaling.functional.matmul", false]], "matmul() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.matmul", false]], "matrix_exp() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.matrix_exp", false]], "matrix_power() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.matrix_power", false]], "max() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.max", false]], "max_absolute_value (unit_scaling.formats.fpformat property)": [[41, "unit_scaling.formats.FPFormat.max_absolute_value", false]], "maximum() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.maximum", false]], "mean() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mean", false]], "median() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.median", false]], "metrics (class in unit_scaling.transforms)": [[84, "unit_scaling.transforms.Metrics", false]], "metrics.data (class in unit_scaling.transforms)": [[84, "unit_scaling.transforms.Metrics.Data", false]], "mh (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.mH", false]], "mhsa (class in unit_scaling)": [[12, "unit_scaling.MHSA", false]], "min() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.min", false]], "min_absolute_normal (unit_scaling.formats.fpformat property)": [[41, "unit_scaling.formats.FPFormat.min_absolute_normal", false]], "min_absolute_subnormal (unit_scaling.formats.fpformat property)": [[41, "unit_scaling.formats.FPFormat.min_absolute_subnormal", false]], "minimum() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.minimum", false]], "mlp (class in unit_scaling)": [[13, "unit_scaling.MLP", false]], "mm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mm", false]], "mode() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mode", false]], "module": [[1, "module-unit_scaling", false], [20, "module-unit_scaling.analysis", false], [25, "module-unit_scaling.constraints", false], [34, "module-unit_scaling.core", false], [35, "module-unit_scaling.core.functional", false], [40, "module-unit_scaling.formats", false], [44, "module-unit_scaling.functional", false], [64, "module-unit_scaling.optim", false], [72, "module-unit_scaling.parameter", false], [79, "module-unit_scaling.scale", false], [83, "module-unit_scaling.transforms", false], [93, "module-unit_scaling.transforms.utils", false], [98, "module-unit_scaling.utils", false]], "module_load() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.module_load", false]], "move_to_end() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.move_to_end", false]], "moveaxis() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.moveaxis", false]], "movedim() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.movedim", false]], "mse_loss() (in module unit_scaling.functional)": [[55, "unit_scaling.functional.mse_loss", false]], "msort() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.msort", false]], "mt (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.mT", false]], "mtia() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mtia", false]], "mul() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mul", false]], "mul_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mul_", false]], "multinomial() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.multinomial", false]], "multiply() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.multiply", false]], "multiply_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.multiply_", false]], "mv() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mv", false]], "mvlgamma() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mvlgamma", false]], "mvlgamma_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.mvlgamma_", false]], "names (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.names", false]], "nan_to_num() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nan_to_num", false]], "nan_to_num_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nan_to_num_", false]], "nanmean() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nanmean", false]], "nanmedian() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nanmedian", false]], "nanquantile() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nanquantile", false]], "nansum() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nansum", false]], "narrow() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.narrow", false]], "narrow_copy() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.narrow_copy", false]], "nbytes (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.nbytes", false]], "ndim (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.ndim", false]], "ndimension() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ndimension", false]], "ne() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ne", false]], "ne_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ne_", false]], "neg() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.neg", false]], "neg_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.neg_", false]], "negative() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.negative", false]], "negative_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.negative_", false]], "nelement() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nelement", false]], "new_empty() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.new_empty", false]], "new_empty_strided() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.new_empty_strided", false]], "new_full() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.new_full", false]], "new_ones() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.new_ones", false]], "new_tensor() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.new_tensor", false]], "new_zeros() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.new_zeros", false]], "nextafter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nextafter", false]], "nextafter_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nextafter_", false]], "nonzero() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nonzero", false]], "nonzero_static() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.nonzero_static", false]], "norm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.norm", false]], "normal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.normal_", false]], "not_equal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.not_equal", false]], "not_equal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.not_equal_", false]], "numel() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.numel", false]], "numpy() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.numpy", false]], "ordereddict (class in unit_scaling.parameter)": [[73, "unit_scaling.parameter.OrderedDict", false]], "orgqr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.orgqr", false]], "ormqr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ormqr", false]], "outer() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.outer", false]], "output() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.output", false]], "parameter() (in module unit_scaling)": [[14, "unit_scaling.Parameter", false]], "parameter() (in module unit_scaling.parameter)": [[74, "unit_scaling.parameter.Parameter", false]], "parameterdata (class in unit_scaling.parameter)": [[75, "unit_scaling.parameter.ParameterData", false]], "patch_to_expand_modules() (in module unit_scaling.transforms.utils)": [[95, "unit_scaling.transforms.utils.patch_to_expand_modules", false]], "permute() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.permute", false]], "pin_memory() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.pin_memory", false]], "pinverse() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.pinverse", false]], "placeholder() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.placeholder", false]], "plot() (in module unit_scaling.analysis)": [[23, "unit_scaling.analysis.plot", false]], "polygamma() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.polygamma", false]], "polygamma_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.polygamma_", false]], "pop() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.pop", false]], "popitem() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.popitem", false]], "positive() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.positive", false]], "pow() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.pow", false]], "pow_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.pow_", false]], "prod() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.prod", false]], "protocol (class in unit_scaling.parameter)": [[76, "unit_scaling.parameter.Protocol", false]], "prune_non_float_tensors() (in module unit_scaling.transforms)": [[86, "unit_scaling.transforms.prune_non_float_tensors", false]], "prune_same_scale_tensors() (in module unit_scaling.transforms)": [[87, "unit_scaling.transforms.prune_same_scale_tensors", false]], "prune_selected_nodes() (in module unit_scaling.transforms)": [[88, "unit_scaling.transforms.prune_selected_nodes", false]], "put() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.put", false]], "put_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.put_", false]], "q_per_channel_axis() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.q_per_channel_axis", false]], "q_per_channel_scales() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.q_per_channel_scales", false]], "q_per_channel_zero_points() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.q_per_channel_zero_points", false]], "q_scale() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.q_scale", false]], "q_zero_point() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.q_zero_point", false]], "qr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.qr", false]], "qscheme() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.qscheme", false]], "quantile() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.quantile", false]], "quantise() (unit_scaling.formats.fpformat method)": [[41, "unit_scaling.formats.FPFormat.quantise", false]], "quantise_bwd() (unit_scaling.formats.fpformat method)": [[41, "unit_scaling.formats.FPFormat.quantise_bwd", false]], "quantise_fwd() (unit_scaling.formats.fpformat method)": [[41, "unit_scaling.formats.FPFormat.quantise_fwd", false]], "rad2deg() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rad2deg", false]], "rad2deg_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rad2deg_", false]], "random_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.random_", false]], "ravel() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.ravel", false]], "real (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.real", false]], "reciprocal() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.reciprocal", false]], "reciprocal_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.reciprocal_", false]], "record_stream() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.record_stream", false]], "refine_names() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.refine_names", false]], "register_hook() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.register_hook", false]], "register_load_state_dict_post_hook() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.register_load_state_dict_post_hook", false]], "register_load_state_dict_post_hook() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.register_load_state_dict_post_hook", false]], "register_load_state_dict_post_hook() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.register_load_state_dict_post_hook", false]], "register_load_state_dict_pre_hook() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.register_load_state_dict_pre_hook", false]], "register_load_state_dict_pre_hook() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.register_load_state_dict_pre_hook", false]], "register_load_state_dict_pre_hook() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.register_load_state_dict_pre_hook", false]], "register_post_accumulate_grad_hook() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.register_post_accumulate_grad_hook", false]], "register_state_dict_post_hook() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.register_state_dict_post_hook", false]], "register_state_dict_post_hook() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.register_state_dict_post_hook", false]], "register_state_dict_post_hook() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.register_state_dict_post_hook", false]], "register_state_dict_pre_hook() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.register_state_dict_pre_hook", false]], "register_state_dict_pre_hook() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.register_state_dict_pre_hook", false]], "register_state_dict_pre_hook() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.register_state_dict_pre_hook", false]], "register_step_post_hook() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.register_step_post_hook", false]], "register_step_post_hook() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.register_step_post_hook", false]], "register_step_post_hook() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.register_step_post_hook", false]], "register_step_pre_hook() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.register_step_pre_hook", false]], "register_step_pre_hook() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.register_step_pre_hook", false]], "register_step_pre_hook() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.register_step_pre_hook", false]], "remainder() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.remainder", false]], "remainder_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.remainder_", false]], "rename() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rename", false]], "rename_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rename_", false]], "renorm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.renorm", false]], "renorm_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.renorm_", false]], "repeat() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.repeat", false]], "repeat_interleave() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.repeat_interleave", false]], "replace_node_with_function() (in module unit_scaling.transforms.utils)": [[96, "unit_scaling.transforms.utils.replace_node_with_function", false]], "requires_grad (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.requires_grad", false]], "requires_grad_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.requires_grad_", false]], "reshape() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.reshape", false]], "reshape_as() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.reshape_as", false]], "residual_add() (in module unit_scaling.functional)": [[56, "unit_scaling.functional.residual_add", false]], "residual_apply() (in module unit_scaling.functional)": [[57, "unit_scaling.functional.residual_apply", false]], "residual_split() (in module unit_scaling.functional)": [[58, "unit_scaling.functional.residual_split", false]], "resize_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.resize_", false]], "resize_as_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.resize_as_", false]], "resolve_conj() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.resolve_conj", false]], "resolve_neg() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.resolve_neg", false]], "retain_grad() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.retain_grad", false]], "retains_grad (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.retains_grad", false]], "rms() (in module unit_scaling.core.functional)": [[37, "unit_scaling.core.functional.rms", false]], "rms_norm() (in module unit_scaling.functional)": [[59, "unit_scaling.functional.rms_norm", false]], "rmsnorm (class in unit_scaling)": [[15, "unit_scaling.RMSNorm", false]], "roll() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.roll", false]], "rot90() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rot90", false]], "round() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.round", false]], "round_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.round_", false]], "rsqrt() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rsqrt", false]], "rsqrt_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.rsqrt_", false]], "run() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.run", false]], "run_node() (unit_scaling.utils.scaletrackinginterpreter method)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter.run_node", false]], "save_for_backward() (unit_scaling.utils.scaletracker method)": [[100, "unit_scaling.utils.ScaleTracker.save_for_backward", false]], "save_for_forward() (unit_scaling.utils.scaletracker method)": [[100, "unit_scaling.utils.ScaleTracker.save_for_forward", false]], "scale_bwd() (in module unit_scaling.scale)": [[80, "unit_scaling.scale.scale_bwd", false]], "scale_elementwise() (in module unit_scaling.core.functional)": [[38, "unit_scaling.core.functional.scale_elementwise", false]], "scale_fwd() (in module unit_scaling.scale)": [[81, "unit_scaling.scale.scale_fwd", false]], "scaled_dot_product_attention() (in module unit_scaling.functional)": [[60, "unit_scaling.functional.scaled_dot_product_attention", false]], "scaled_parameters() (in module unit_scaling.optim)": [[71, "unit_scaling.optim.scaled_parameters", false]], "scalepair (class in unit_scaling.utils)": [[99, "unit_scaling.utils.ScalePair", false]], "scaletracker (class in unit_scaling.utils)": [[100, "unit_scaling.utils.ScaleTracker", false]], "scaletrackinginterpreter (class in unit_scaling.utils)": [[101, "unit_scaling.utils.ScaleTrackingInterpreter", false]], "scatter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.scatter", false]], "scatter_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.scatter_", false]], "scatter_add() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.scatter_add", false]], "scatter_add_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.scatter_add_", false]], "scatter_reduce() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.scatter_reduce", false]], "scatter_reduce_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.scatter_reduce_", false]], "select() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.select", false]], "select_scatter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.select_scatter", false]], "set_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.set_", false]], "set_materialize_grads() (unit_scaling.utils.scaletracker method)": [[100, "unit_scaling.utils.ScaleTracker.set_materialize_grads", false]], "setdefault() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.setdefault", false]], "setup_context() (unit_scaling.utils.scaletracker static method)": [[100, "unit_scaling.utils.ScaleTracker.setup_context", false]], "sgd (class in unit_scaling.optim)": [[67, "unit_scaling.optim.SGD", false]], "sgn() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sgn", false]], "sgn_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sgn_", false]], "shape (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.shape", false]], "share_memory_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.share_memory_", false]], "short() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.short", false]], "sigmoid() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sigmoid", false]], "sigmoid_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sigmoid_", false]], "sign() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sign", false]], "sign_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sign_", false]], "signbit() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.signbit", false]], "silu (class in unit_scaling)": [[16, "unit_scaling.SiLU", false]], "silu() (in module unit_scaling.functional)": [[61, "unit_scaling.functional.silu", false]], "silu_glu() (in module unit_scaling.functional)": [[62, "unit_scaling.functional.silu_glu", false]], "simulate_format() (in module unit_scaling.transforms)": [[89, "unit_scaling.transforms.simulate_format", false]], "simulate_fp8() (in module unit_scaling.transforms)": [[90, "unit_scaling.transforms.simulate_fp8", false]], "sin() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sin", false]], "sin_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sin_", false]], "sinc() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sinc", false]], "sinc_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sinc_", false]], "sinh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sinh", false]], "sinh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sinh_", false]], "size() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.size", false]], "slice_scatter() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.slice_scatter", false]], "slogdet() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.slogdet", false]], "smm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.smm", false]], "softmax (class in unit_scaling)": [[17, "unit_scaling.Softmax", false]], "softmax() (in module unit_scaling.functional)": [[63, "unit_scaling.functional.softmax", false]], "softmax() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.softmax", false]], "sort() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sort", false]], "sparse_dim() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sparse_dim", false]], "sparse_mask() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sparse_mask", false]], "sparse_resize_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sparse_resize_", false]], "sparse_resize_and_clear_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sparse_resize_and_clear_", false]], "split() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.split", false]], "sqrt() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sqrt", false]], "sqrt_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sqrt_", false]], "square() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.square", false]], "square_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.square_", false]], "squeeze() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.squeeze", false]], "squeeze_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.squeeze_", false]], "sspaddmm() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sspaddmm", false]], "state_dict() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.state_dict", false]], "state_dict() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.state_dict", false]], "state_dict() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.state_dict", false]], "std() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.std", false]], "step() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.step", false]], "step() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.step", false]], "step() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.step", false]], "stft() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.stft", false]], "storage() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.storage", false]], "storage_offset() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.storage_offset", false]], "storage_type() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.storage_type", false]], "stride() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.stride", false]], "sub() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sub", false]], "sub_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sub_", false]], "subtract() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.subtract", false]], "subtract_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.subtract_", false]], "sum() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sum", false]], "sum_to_size() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.sum_to_size", false]], "svd() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.svd", false]], "swapaxes() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.swapaxes", false]], "swapaxes_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.swapaxes_", false]], "swapdims() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.swapdims", false]], "swapdims_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.swapdims_", false]], "t (unit_scaling.parameter.tensor attribute)": [[77, "unit_scaling.parameter.Tensor.T", false]], "t() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.t", false]], "t_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.t_", false]], "take() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.take", false]], "take_along_dim() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.take_along_dim", false]], "tan() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tan", false]], "tan_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tan_", false]], "tanh() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tanh", false]], "tanh_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tanh_", false]], "tensor (class in unit_scaling.parameter)": [[77, "unit_scaling.parameter.Tensor", false]], "tensor_split() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tensor_split", false]], "tile() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tile", false]], "to() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to", false]], "to_dense() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_dense", false]], "to_grad_input_scale() (in module unit_scaling.constraints)": [[30, "unit_scaling.constraints.to_grad_input_scale", false]], "to_left_grad_scale() (in module unit_scaling.constraints)": [[31, "unit_scaling.constraints.to_left_grad_scale", false]], "to_mkldnn() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_mkldnn", false]], "to_output_scale() (in module unit_scaling.constraints)": [[32, "unit_scaling.constraints.to_output_scale", false]], "to_padded_tensor() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_padded_tensor", false]], "to_right_grad_scale() (in module unit_scaling.constraints)": [[33, "unit_scaling.constraints.to_right_grad_scale", false]], "to_sparse() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_sparse", false]], "to_sparse_bsc() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_sparse_bsc", false]], "to_sparse_bsr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_sparse_bsr", false]], "to_sparse_coo() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_sparse_coo", false]], "to_sparse_csc() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_sparse_csc", false]], "to_sparse_csr() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.to_sparse_csr", false]], "tolist() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tolist", false]], "topk() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.topk", false]], "torch_nn_modules_to_user_modules() (in module unit_scaling.transforms.utils)": [[97, "unit_scaling.transforms.utils.torch_nn_modules_to_user_modules", false]], "trace() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.trace", false]], "track_scales() (in module unit_scaling.transforms)": [[91, "unit_scaling.transforms.track_scales", false]], "transformer_residual_scaling_rule() (in module unit_scaling)": [[82, "unit_scaling.transformer_residual_scaling_rule", false]], "transformer_residual_scaling_rule() (in module unit_scaling.core.functional)": [[39, "unit_scaling.core.functional.transformer_residual_scaling_rule", false]], "transformerdecoder (class in unit_scaling)": [[18, "unit_scaling.TransformerDecoder", false]], "transformerlayer (class in unit_scaling)": [[19, "unit_scaling.TransformerLayer", false]], "transpose() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.transpose", false]], "transpose_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.transpose_", false]], "triangular_solve() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.triangular_solve", false]], "tril() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tril", false]], "tril_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.tril_", false]], "triu() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.triu", false]], "triu_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.triu_", false]], "true_divide() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.true_divide", false]], "true_divide_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.true_divide_", false]], "trunc() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.trunc", false]], "trunc_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.trunc_", false]], "tuple_to_format() (in module unit_scaling.formats)": [[43, "unit_scaling.formats.tuple_to_format", false]], "type() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.type", false]], "type_as() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.type_as", false]], "unbind() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unbind", false]], "unflatten() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unflatten", false]], "unfold() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unfold", false]], "uniform_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.uniform_", false]], "unique() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unique", false]], "unique_consecutive() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unique_consecutive", false]], "unit_scale() (in module unit_scaling.transforms)": [[92, "unit_scaling.transforms.unit_scale", false]], "unit_scaling": [[1, "module-unit_scaling", false]], "unit_scaling.analysis": [[20, "module-unit_scaling.analysis", false]], "unit_scaling.constraints": [[25, "module-unit_scaling.constraints", false]], "unit_scaling.core": [[34, "module-unit_scaling.core", false]], "unit_scaling.core.functional": [[35, "module-unit_scaling.core.functional", false]], "unit_scaling.formats": [[40, "module-unit_scaling.formats", false]], "unit_scaling.functional": [[44, "module-unit_scaling.functional", false]], "unit_scaling.optim": [[64, "module-unit_scaling.optim", false]], "unit_scaling.parameter": [[72, "module-unit_scaling.parameter", false]], "unit_scaling.scale": [[79, "module-unit_scaling.scale", false]], "unit_scaling.transforms": [[83, "module-unit_scaling.transforms", false]], "unit_scaling.transforms.utils": [[93, "module-unit_scaling.transforms.utils", false]], "unit_scaling.utils": [[98, "module-unit_scaling.utils", false]], "unsafe_chunk() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unsafe_chunk", false]], "unsafe_split() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unsafe_split", false]], "unsqueeze() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unsqueeze", false]], "unsqueeze_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.unsqueeze_", false]], "untyped_storage() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.untyped_storage", false]], "update() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.update", false]], "values() (unit_scaling.parameter.ordereddict method)": [[73, "unit_scaling.parameter.OrderedDict.values", false]], "values() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.values", false]], "var() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.var", false]], "vdot() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.vdot", false]], "view() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.view", false]], "view_as() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.view_as", false]], "visualiser() (in module unit_scaling)": [[103, "unit_scaling.visualiser", false]], "visualiser() (in module unit_scaling.analysis)": [[24, "unit_scaling.analysis.visualiser", false]], "vjp() (unit_scaling.utils.scaletracker static method)": [[100, "unit_scaling.utils.ScaleTracker.vjp", false]], "vmap() (unit_scaling.utils.scaletracker static method)": [[100, "unit_scaling.utils.ScaleTracker.vmap", false]], "vsplit() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.vsplit", false]], "weight (unit_scaling.conv1d attribute)": [[2, "unit_scaling.Conv1d.weight", false]], "weight (unit_scaling.embedding attribute)": [[7, "unit_scaling.Embedding.weight", false]], "weight (unit_scaling.layernorm attribute)": [[9, "unit_scaling.LayerNorm.weight", false]], "weight (unit_scaling.linear attribute)": [[10, "unit_scaling.Linear.weight", false]], "weight (unit_scaling.linearreadout attribute)": [[11, "unit_scaling.LinearReadout.weight", false]], "weight (unit_scaling.rmsnorm attribute)": [[15, "unit_scaling.RMSNorm.weight", false]], "where() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.where", false]], "xlogy() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.xlogy", false]], "xlogy_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.xlogy_", false]], "xpu() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.xpu", false]], "zero_() (unit_scaling.parameter.tensor method)": [[77, "unit_scaling.parameter.Tensor.zero_", false]], "zero_grad() (unit_scaling.optim.adam method)": [[65, "unit_scaling.optim.Adam.zero_grad", false]], "zero_grad() (unit_scaling.optim.adamw method)": [[66, "unit_scaling.optim.AdamW.zero_grad", false]], "zero_grad() (unit_scaling.optim.sgd method)": [[67, "unit_scaling.optim.SGD.zero_grad", false]]}, "objects": {"": [[1, 0, 0, "-", "unit_scaling"]], "unit_scaling": [[2, 1, 1, "", "Conv1d"], [3, 1, 1, "", "CrossEntropyLoss"], [4, 1, 1, "", "DepthModuleList"], [5, 1, 1, "", "DepthSequential"], [6, 1, 1, "", "Dropout"], [7, 1, 1, "", "Embedding"], [8, 1, 1, "", "GELU"], [9, 1, 1, "", "LayerNorm"], [10, 1, 1, "", "Linear"], [11, 1, 1, "", "LinearReadout"], [12, 1, 1, "", "MHSA"], [13, 1, 1, "", "MLP"], [14, 4, 1, "", "Parameter"], [15, 1, 1, "", "RMSNorm"], [16, 1, 1, "", "SiLU"], [17, 1, 1, "", "Softmax"], [18, 1, 1, "", "TransformerDecoder"], [19, 1, 1, "", "TransformerLayer"], [20, 0, 0, "-", "analysis"], [25, 0, 0, "-", "constraints"], [34, 0, 0, "-", "core"], [40, 0, 0, "-", "formats"], [44, 0, 0, "-", "functional"], [64, 0, 0, "-", "optim"], [72, 0, 0, "-", "parameter"], [79, 0, 0, "-", "scale"], [82, 4, 1, "", "transformer_residual_scaling_rule"], [83, 0, 0, "-", "transforms"], [98, 0, 0, "-", "utils"], [103, 4, 1, "", "visualiser"]], "unit_scaling.Conv1d": [[2, 2, 1, "", "bias"], [2, 2, 1, "", "weight"]], "unit_scaling.DepthModuleList": [[4, 3, 1, "", "append"], [4, 3, 1, "", "extend"], [4, 3, 1, "", "insert"]], "unit_scaling.DepthSequential": [[5, 3, 1, "", "append"]], "unit_scaling.Embedding": [[7, 3, 1, "", "from_pretrained"], [7, 2, 1, "", "weight"]], "unit_scaling.LayerNorm": [[9, 2, 1, "", "bias"], [9, 2, 1, "", "weight"]], "unit_scaling.Linear": [[10, 2, 1, "", "bias"], [10, 2, 1, "", "weight"]], "unit_scaling.LinearReadout": [[11, 2, 1, "", "bias"], [11, 2, 1, "", "weight"]], "unit_scaling.RMSNorm": [[15, 2, 1, "", "weight"]], "unit_scaling.TransformerDecoder": [[18, 3, 1, "", "append"]], "unit_scaling.analysis": [[21, 4, 1, "", "example_batch"], [22, 4, 1, "", "graph_to_dataframe"], [23, 4, 1, "", "plot"], [24, 4, 1, "", "visualiser"]], "unit_scaling.constraints": [[26, 4, 1, "", "amean"], [27, 4, 1, "", "apply_constraint"], [28, 4, 1, "", "gmean"], [29, 4, 1, "", "hmean"], [30, 4, 1, "", "to_grad_input_scale"], [31, 4, 1, "", "to_left_grad_scale"], [32, 4, 1, "", "to_output_scale"], [33, 4, 1, "", "to_right_grad_scale"]], "unit_scaling.core": [[35, 0, 0, "-", "functional"]], "unit_scaling.core.functional": [[36, 4, 1, "", "logarithmic_interpolation"], [37, 4, 1, "", "rms"], [38, 4, 1, "", "scale_elementwise"], [39, 4, 1, "", "transformer_residual_scaling_rule"]], "unit_scaling.formats": [[41, 1, 1, "", "FPFormat"], [42, 4, 1, "", "format_to_tuple"], [43, 4, 1, "", "tuple_to_format"]], "unit_scaling.formats.FPFormat": [[41, 5, 1, "", "bits"], [41, 5, 1, "", "max_absolute_value"], [41, 5, 1, "", "min_absolute_normal"], [41, 5, 1, "", "min_absolute_subnormal"], [41, 3, 1, "", "quantise"], [41, 3, 1, "", "quantise_bwd"], [41, 3, 1, "", "quantise_fwd"]], "unit_scaling.functional": [[45, 4, 1, "", "add"], [46, 4, 1, "", "conv1d"], [47, 4, 1, "", "cross_entropy"], [48, 4, 1, "", "dropout"], [49, 4, 1, "", "embedding"], [50, 4, 1, "", "gelu"], [51, 4, 1, "", "layer_norm"], [52, 4, 1, "", "linear"], [53, 4, 1, "", "linear_readout"], [54, 4, 1, "", "matmul"], [55, 4, 1, "", "mse_loss"], [56, 4, 1, "", "residual_add"], [57, 4, 1, "", "residual_apply"], [58, 4, 1, "", "residual_split"], [59, 4, 1, "", "rms_norm"], [60, 4, 1, "", "scaled_dot_product_attention"], [61, 4, 1, "", "silu"], [62, 4, 1, "", "silu_glu"], [63, 4, 1, "", "softmax"]], "unit_scaling.optim": [[65, 1, 1, "", "Adam"], [66, 1, 1, "", "AdamW"], [67, 1, 1, "", "SGD"], [68, 4, 1, "", "lr_scale_for_depth"], [69, 4, 1, "", "lr_scale_func_adam"], [70, 4, 1, "", "lr_scale_func_sgd"], [71, 4, 1, "", "scaled_parameters"]], "unit_scaling.optim.Adam": [[65, 3, 1, "", "add_param_group"], [65, 3, 1, "", "load_state_dict"], [65, 3, 1, "", "register_load_state_dict_post_hook"], [65, 3, 1, "", "register_load_state_dict_pre_hook"], [65, 3, 1, "", "register_state_dict_post_hook"], [65, 3, 1, "", "register_state_dict_pre_hook"], [65, 3, 1, "", "register_step_post_hook"], [65, 3, 1, "", "register_step_pre_hook"], [65, 3, 1, "", "state_dict"], [65, 3, 1, "", "step"], [65, 3, 1, "", "zero_grad"]], "unit_scaling.optim.AdamW": [[66, 3, 1, "", "add_param_group"], [66, 3, 1, "", "load_state_dict"], [66, 3, 1, "", "register_load_state_dict_post_hook"], [66, 3, 1, "", "register_load_state_dict_pre_hook"], [66, 3, 1, "", "register_state_dict_post_hook"], [66, 3, 1, "", "register_state_dict_pre_hook"], [66, 3, 1, "", "register_step_post_hook"], [66, 3, 1, "", "register_step_pre_hook"], [66, 3, 1, "", "state_dict"], [66, 3, 1, "", "step"], [66, 3, 1, "", "zero_grad"]], "unit_scaling.optim.SGD": [[67, 3, 1, "", "add_param_group"], [67, 3, 1, "", "load_state_dict"], [67, 3, 1, "", "register_load_state_dict_post_hook"], [67, 3, 1, "", "register_load_state_dict_pre_hook"], [67, 3, 1, "", "register_state_dict_post_hook"], [67, 3, 1, "", "register_state_dict_pre_hook"], [67, 3, 1, "", "register_step_post_hook"], [67, 3, 1, "", "register_step_pre_hook"], [67, 3, 1, "", "state_dict"], [67, 3, 1, "", "step"], [67, 3, 1, "", "zero_grad"]], "unit_scaling.parameter": [[73, 1, 1, "", "OrderedDict"], [74, 4, 1, "", "Parameter"], [75, 1, 1, "", "ParameterData"], [76, 1, 1, "", "Protocol"], [77, 1, 1, "", "Tensor"], [78, 4, 1, "", "has_parameter_data"]], "unit_scaling.parameter.OrderedDict": [[73, 3, 1, "", "clear"], [73, 3, 1, "", "copy"], [73, 3, 1, "", "fromkeys"], [73, 3, 1, "", "get"], [73, 3, 1, "", "items"], [73, 3, 1, "", "keys"], [73, 3, 1, "", "move_to_end"], [73, 3, 1, "", "pop"], [73, 3, 1, "", "popitem"], [73, 3, 1, "", "setdefault"], [73, 3, 1, "", "update"], [73, 3, 1, "", "values"]], "unit_scaling.parameter.Tensor": [[77, 2, 1, "", "H"], [77, 2, 1, "", "T"], [77, 3, 1, "", "abs"], [77, 3, 1, "", "abs_"], [77, 3, 1, "", "absolute"], [77, 3, 1, "", "absolute_"], [77, 3, 1, "", "acos"], [77, 3, 1, "", "acos_"], [77, 3, 1, "", "acosh"], [77, 3, 1, "", "acosh_"], [77, 3, 1, "", "add"], [77, 3, 1, "", "add_"], [77, 3, 1, "", "addbmm"], [77, 3, 1, "", "addbmm_"], [77, 3, 1, "", "addcdiv"], [77, 3, 1, "", "addcdiv_"], [77, 3, 1, "", "addcmul"], [77, 3, 1, "", "addcmul_"], [77, 3, 1, "", "addmm"], [77, 3, 1, "", "addmm_"], [77, 3, 1, "", "addmv"], [77, 3, 1, "", "addmv_"], [77, 3, 1, "", "addr"], [77, 3, 1, "", "addr_"], [77, 3, 1, "", "adjoint"], [77, 3, 1, "", "align_as"], [77, 3, 1, "", "align_to"], [77, 3, 1, "", "all"], [77, 3, 1, "", "allclose"], [77, 3, 1, "", "amax"], [77, 3, 1, "", "amin"], [77, 3, 1, "", "aminmax"], [77, 3, 1, "", "angle"], [77, 3, 1, "", "any"], [77, 3, 1, "", "apply_"], [77, 3, 1, "", "arccos"], [77, 3, 1, "", "arccos_"], [77, 3, 1, "", "arccosh"], [77, 3, 1, "", "arccosh_"], [77, 3, 1, "", "arcsin"], [77, 3, 1, "", "arcsin_"], [77, 3, 1, "", "arcsinh"], [77, 3, 1, "", "arcsinh_"], [77, 3, 1, "", "arctan"], [77, 3, 1, "", "arctan2"], [77, 3, 1, "", "arctan2_"], [77, 3, 1, "", "arctan_"], [77, 3, 1, "", "arctanh"], [77, 3, 1, "", "arctanh_"], [77, 3, 1, "", "argmax"], [77, 3, 1, "", "argmin"], [77, 3, 1, "", "argsort"], [77, 3, 1, "", "argwhere"], [77, 3, 1, "", "as_strided"], [77, 3, 1, "", "as_strided_"], [77, 3, 1, "", "as_strided_scatter"], [77, 3, 1, "", "as_subclass"], [77, 3, 1, "", "asin"], [77, 3, 1, "", "asin_"], [77, 3, 1, "", "asinh"], [77, 3, 1, "", "asinh_"], [77, 3, 1, "", "atan"], [77, 3, 1, "", "atan2"], [77, 3, 1, "", "atan2_"], [77, 3, 1, "", "atan_"], [77, 3, 1, "", "atanh"], [77, 3, 1, "", "atanh_"], [77, 3, 1, "", "backward"], [77, 3, 1, "", "baddbmm"], [77, 3, 1, "", "baddbmm_"], [77, 3, 1, "", "bernoulli"], [77, 3, 1, "", "bernoulli_"], [77, 3, 1, "", "bfloat16"], [77, 3, 1, "", "bincount"], [77, 3, 1, "", "bitwise_and"], [77, 3, 1, "", "bitwise_and_"], [77, 3, 1, "", "bitwise_left_shift"], [77, 3, 1, "", "bitwise_left_shift_"], [77, 3, 1, "", "bitwise_not"], [77, 3, 1, "", "bitwise_not_"], [77, 3, 1, "", "bitwise_or"], [77, 3, 1, "", "bitwise_or_"], [77, 3, 1, "", "bitwise_right_shift"], [77, 3, 1, "", "bitwise_right_shift_"], [77, 3, 1, "", "bitwise_xor"], [77, 3, 1, "", "bitwise_xor_"], [77, 3, 1, "", "bmm"], [77, 3, 1, "", "bool"], [77, 3, 1, "", "broadcast_to"], [77, 3, 1, "", "byte"], [77, 3, 1, "", "cauchy_"], [77, 3, 1, "", "cdouble"], [77, 3, 1, "", "ceil"], [77, 3, 1, "", "ceil_"], [77, 3, 1, "", "cfloat"], [77, 3, 1, "", "chalf"], [77, 3, 1, "", "char"], [77, 3, 1, "", "cholesky"], [77, 3, 1, "", "cholesky_inverse"], [77, 3, 1, "", "cholesky_solve"], [77, 3, 1, "", "chunk"], [77, 3, 1, "", "clamp"], [77, 3, 1, "", "clamp_"], [77, 3, 1, "", "clip"], [77, 3, 1, "", "clip_"], [77, 3, 1, "", "clone"], [77, 3, 1, "", "coalesce"], [77, 3, 1, "", "col_indices"], [77, 3, 1, "", "conj"], [77, 3, 1, "", "conj_physical"], [77, 3, 1, "", "conj_physical_"], [77, 3, 1, "", "contiguous"], [77, 3, 1, "", "copy_"], [77, 3, 1, "", "copysign"], [77, 3, 1, "", "copysign_"], [77, 3, 1, "", "corrcoef"], [77, 3, 1, "", "cos"], [77, 3, 1, "", "cos_"], [77, 3, 1, "", "cosh"], [77, 3, 1, "", "cosh_"], [77, 3, 1, "", "count_nonzero"], [77, 3, 1, "", "cov"], [77, 3, 1, "", "cpu"], [77, 3, 1, "", "cross"], [77, 3, 1, "", "crow_indices"], [77, 3, 1, "", "cuda"], [77, 3, 1, "", "cummax"], [77, 3, 1, "", "cummin"], [77, 3, 1, "", "cumprod"], [77, 3, 1, "", "cumprod_"], [77, 3, 1, "", "cumsum"], [77, 3, 1, "", "cumsum_"], [77, 3, 1, "", "data_ptr"], [77, 3, 1, "", "deg2rad"], [77, 3, 1, "", "deg2rad_"], [77, 3, 1, "", "dense_dim"], [77, 3, 1, "", "dequantize"], [77, 3, 1, "", "det"], [77, 3, 1, "", "detach"], [77, 3, 1, "", "detach_"], [77, 2, 1, "", "device"], [77, 3, 1, "", "diag"], [77, 3, 1, "", "diag_embed"], [77, 3, 1, "", "diagflat"], [77, 3, 1, "", "diagonal"], [77, 3, 1, "", "diagonal_scatter"], [77, 3, 1, "", "diff"], [77, 3, 1, "", "digamma"], [77, 3, 1, "", "digamma_"], [77, 3, 1, "", "dim"], [77, 3, 1, "", "dim_order"], [77, 3, 1, "", "dist"], [77, 3, 1, "", "div"], [77, 3, 1, "", "div_"], [77, 3, 1, "", "divide"], [77, 3, 1, "", "divide_"], [77, 3, 1, "", "dot"], [77, 3, 1, "", "double"], [77, 3, 1, "", "dsplit"], [77, 3, 1, "", "element_size"], [77, 3, 1, "", "eq"], [77, 3, 1, "", "eq_"], [77, 3, 1, "", "equal"], [77, 3, 1, "", "erf"], [77, 3, 1, "", "erf_"], [77, 3, 1, "", "erfc"], [77, 3, 1, "", "erfc_"], [77, 3, 1, "", "erfinv"], [77, 3, 1, "", "erfinv_"], [77, 3, 1, "", "exp"], [77, 3, 1, "", "exp2"], [77, 3, 1, "", "exp2_"], [77, 3, 1, "", "exp_"], [77, 3, 1, "", "expand"], [77, 3, 1, "", "expand_as"], [77, 3, 1, "", "expm1"], [77, 3, 1, "", "expm1_"], [77, 3, 1, "", "exponential_"], [77, 3, 1, "", "fill_"], [77, 3, 1, "", "fill_diagonal_"], [77, 3, 1, "", "fix"], [77, 3, 1, "", "fix_"], [77, 3, 1, "", "flatten"], [77, 3, 1, "", "flip"], [77, 3, 1, "", "fliplr"], [77, 3, 1, "", "flipud"], [77, 3, 1, "", "float"], [77, 3, 1, "", "float_power"], [77, 3, 1, "", "float_power_"], [77, 3, 1, "", "floor"], [77, 3, 1, "", "floor_"], [77, 3, 1, "", "floor_divide"], [77, 3, 1, "", "floor_divide_"], [77, 3, 1, "", "fmax"], [77, 3, 1, "", "fmin"], [77, 3, 1, "", "fmod"], [77, 3, 1, "", "fmod_"], [77, 3, 1, "", "frac"], [77, 3, 1, "", "frac_"], [77, 3, 1, "", "frexp"], [77, 3, 1, "", "gather"], [77, 3, 1, "", "gcd"], [77, 3, 1, "", "gcd_"], [77, 3, 1, "", "ge"], [77, 3, 1, "", "ge_"], [77, 3, 1, "", "geometric_"], [77, 3, 1, "", "geqrf"], [77, 3, 1, "", "ger"], [77, 3, 1, "", "get_device"], [77, 2, 1, "", "grad"], [77, 3, 1, "", "greater"], [77, 3, 1, "", "greater_"], [77, 3, 1, "", "greater_equal"], [77, 3, 1, "", "greater_equal_"], [77, 3, 1, "", "gt"], [77, 3, 1, "", "gt_"], [77, 3, 1, "", "half"], [77, 3, 1, "", "hardshrink"], [77, 3, 1, "", "has_names"], [77, 3, 1, "", "heaviside"], [77, 3, 1, "", "heaviside_"], [77, 3, 1, "", "histc"], [77, 3, 1, "", "histogram"], [77, 3, 1, "", "hsplit"], [77, 3, 1, "", "hypot"], [77, 3, 1, "", "hypot_"], [77, 3, 1, "", "i0"], [77, 3, 1, "", "i0_"], [77, 3, 1, "", "igamma"], [77, 3, 1, "", "igamma_"], [77, 3, 1, "", "igammac"], [77, 3, 1, "", "igammac_"], [77, 2, 1, "", "imag"], [77, 3, 1, "", "index_add"], [77, 3, 1, "", "index_add_"], [77, 3, 1, "", "index_copy"], [77, 3, 1, "", "index_copy_"], [77, 3, 1, "", "index_fill"], [77, 3, 1, "", "index_fill_"], [77, 3, 1, "", "index_put"], [77, 3, 1, "", "index_put_"], [77, 3, 1, "", "index_reduce_"], [77, 3, 1, "", "index_select"], [77, 3, 1, "", "indices"], [77, 3, 1, "", "inner"], [77, 3, 1, "", "int"], [77, 3, 1, "", "int_repr"], [77, 3, 1, "", "inverse"], [77, 3, 1, "", "ipu"], [77, 3, 1, "", "is_coalesced"], [77, 3, 1, "", "is_complex"], [77, 3, 1, "", "is_conj"], [77, 3, 1, "", "is_contiguous"], [77, 2, 1, "", "is_cpu"], [77, 2, 1, "", "is_cuda"], [77, 3, 1, "", "is_floating_point"], [77, 3, 1, "", "is_inference"], [77, 2, 1, "", "is_ipu"], [77, 2, 1, "", "is_leaf"], [77, 2, 1, "", "is_meta"], [77, 2, 1, "", "is_mps"], [77, 3, 1, "", "is_neg"], [77, 3, 1, "", "is_pinned"], [77, 2, 1, "", "is_quantized"], [77, 3, 1, "", "is_set_to"], [77, 3, 1, "", "is_shared"], [77, 3, 1, "", "is_signed"], [77, 2, 1, "", "is_sparse"], [77, 2, 1, "", "is_sparse_csr"], [77, 2, 1, "", "is_xla"], [77, 2, 1, "", "is_xpu"], [77, 3, 1, "", "isclose"], [77, 3, 1, "", "isfinite"], [77, 3, 1, "", "isinf"], [77, 3, 1, "", "isnan"], [77, 3, 1, "", "isneginf"], [77, 3, 1, "", "isposinf"], [77, 3, 1, "", "isreal"], [77, 3, 1, "", "istft"], [77, 3, 1, "", "item"], [77, 2, 1, "", "itemsize"], [77, 3, 1, "", "kron"], [77, 3, 1, "", "kthvalue"], [77, 3, 1, "", "lcm"], [77, 3, 1, "", "lcm_"], [77, 3, 1, "", "ldexp"], [77, 3, 1, "", "ldexp_"], [77, 3, 1, "", "le"], [77, 3, 1, "", "le_"], [77, 3, 1, "", "lerp"], [77, 3, 1, "", "lerp_"], [77, 3, 1, "", "less"], [77, 3, 1, "", "less_"], [77, 3, 1, "", "less_equal"], [77, 3, 1, "", "less_equal_"], [77, 3, 1, "", "lgamma"], [77, 3, 1, "", "lgamma_"], [77, 3, 1, "", "log"], [77, 3, 1, "", "log10"], [77, 3, 1, "", "log10_"], [77, 3, 1, "", "log1p"], [77, 3, 1, "", "log1p_"], [77, 3, 1, "", "log2"], [77, 3, 1, "", "log2_"], [77, 3, 1, "", "log_"], [77, 3, 1, "", "log_normal_"], [77, 3, 1, "", "logaddexp"], [77, 3, 1, "", "logaddexp2"], [77, 3, 1, "", "logcumsumexp"], [77, 3, 1, "", "logdet"], [77, 3, 1, "", "logical_and"], [77, 3, 1, "", "logical_and_"], [77, 3, 1, "", "logical_not"], [77, 3, 1, "", "logical_not_"], [77, 3, 1, "", "logical_or"], [77, 3, 1, "", "logical_or_"], [77, 3, 1, "", "logical_xor"], [77, 3, 1, "", "logical_xor_"], [77, 3, 1, "", "logit"], [77, 3, 1, "", "logit_"], [77, 3, 1, "", "logsumexp"], [77, 3, 1, "", "long"], [77, 3, 1, "", "lt"], [77, 3, 1, "", "lt_"], [77, 3, 1, "", "lu"], [77, 3, 1, "", "lu_solve"], [77, 2, 1, "", "mH"], [77, 2, 1, "", "mT"], [77, 3, 1, "", "map_"], [77, 3, 1, "", "masked_fill"], [77, 3, 1, "", "masked_fill_"], [77, 3, 1, "", "masked_scatter"], [77, 3, 1, "", "masked_scatter_"], [77, 3, 1, "", "masked_select"], [77, 3, 1, "", "matmul"], [77, 3, 1, "", "matrix_exp"], [77, 3, 1, "", "matrix_power"], [77, 3, 1, "", "max"], [77, 3, 1, "", "maximum"], [77, 3, 1, "", "mean"], [77, 3, 1, "", "median"], [77, 3, 1, "", "min"], [77, 3, 1, "", "minimum"], [77, 3, 1, "", "mm"], [77, 3, 1, "", "mode"], [77, 3, 1, "", "module_load"], [77, 3, 1, "", "moveaxis"], [77, 3, 1, "", "movedim"], [77, 3, 1, "", "msort"], [77, 3, 1, "", "mtia"], [77, 3, 1, "", "mul"], [77, 3, 1, "", "mul_"], [77, 3, 1, "", "multinomial"], [77, 3, 1, "", "multiply"], [77, 3, 1, "", "multiply_"], [77, 3, 1, "", "mv"], [77, 3, 1, "", "mvlgamma"], [77, 3, 1, "", "mvlgamma_"], [77, 2, 1, "", "names"], [77, 3, 1, "", "nan_to_num"], [77, 3, 1, "", "nan_to_num_"], [77, 3, 1, "", "nanmean"], [77, 3, 1, "", "nanmedian"], [77, 3, 1, "", "nanquantile"], [77, 3, 1, "", "nansum"], [77, 3, 1, "", "narrow"], [77, 3, 1, "", "narrow_copy"], [77, 2, 1, "", "nbytes"], [77, 2, 1, "", "ndim"], [77, 3, 1, "", "ndimension"], [77, 3, 1, "", "ne"], [77, 3, 1, "", "ne_"], [77, 3, 1, "", "neg"], [77, 3, 1, "", "neg_"], [77, 3, 1, "", "negative"], [77, 3, 1, "", "negative_"], [77, 3, 1, "", "nelement"], [77, 3, 1, "", "new_empty"], [77, 3, 1, "", "new_empty_strided"], [77, 3, 1, "", "new_full"], [77, 3, 1, "", "new_ones"], [77, 3, 1, "", "new_tensor"], [77, 3, 1, "", "new_zeros"], [77, 3, 1, "", "nextafter"], [77, 3, 1, "", "nextafter_"], [77, 3, 1, "", "nonzero"], [77, 3, 1, "", "nonzero_static"], [77, 3, 1, "", "norm"], [77, 3, 1, "", "normal_"], [77, 3, 1, "", "not_equal"], [77, 3, 1, "", "not_equal_"], [77, 3, 1, "", "numel"], [77, 3, 1, "", "numpy"], [77, 3, 1, "", "orgqr"], [77, 3, 1, "", "ormqr"], [77, 3, 1, "", "outer"], [77, 3, 1, "", "permute"], [77, 3, 1, "", "pin_memory"], [77, 3, 1, "", "pinverse"], [77, 3, 1, "", "polygamma"], [77, 3, 1, "", "polygamma_"], [77, 3, 1, "", "positive"], [77, 3, 1, "", "pow"], [77, 3, 1, "", "pow_"], [77, 3, 1, "", "prod"], [77, 3, 1, "", "put"], [77, 3, 1, "", "put_"], [77, 3, 1, "", "q_per_channel_axis"], [77, 3, 1, "", "q_per_channel_scales"], [77, 3, 1, "", "q_per_channel_zero_points"], [77, 3, 1, "", "q_scale"], [77, 3, 1, "", "q_zero_point"], [77, 3, 1, "", "qr"], [77, 3, 1, "", "qscheme"], [77, 3, 1, "", "quantile"], [77, 3, 1, "", "rad2deg"], [77, 3, 1, "", "rad2deg_"], [77, 3, 1, "", "random_"], [77, 3, 1, "", "ravel"], [77, 2, 1, "", "real"], [77, 3, 1, "", "reciprocal"], [77, 3, 1, "", "reciprocal_"], [77, 3, 1, "", "record_stream"], [77, 3, 1, "", "refine_names"], [77, 3, 1, "", "register_hook"], [77, 3, 1, "", "register_post_accumulate_grad_hook"], [77, 3, 1, "", "remainder"], [77, 3, 1, "", "remainder_"], [77, 3, 1, "", "rename"], [77, 3, 1, "", "rename_"], [77, 3, 1, "", "renorm"], [77, 3, 1, "", "renorm_"], [77, 3, 1, "", "repeat"], [77, 3, 1, "", "repeat_interleave"], [77, 2, 1, "", "requires_grad"], [77, 3, 1, "", "requires_grad_"], [77, 3, 1, "", "reshape"], [77, 3, 1, "", "reshape_as"], [77, 3, 1, "", "resize_"], [77, 3, 1, "", "resize_as_"], [77, 3, 1, "", "resolve_conj"], [77, 3, 1, "", "resolve_neg"], [77, 3, 1, "", "retain_grad"], [77, 2, 1, "", "retains_grad"], [77, 3, 1, "", "roll"], [77, 3, 1, "", "rot90"], [77, 3, 1, "", "round"], [77, 3, 1, "", "round_"], [77, 3, 1, "", "rsqrt"], [77, 3, 1, "", "rsqrt_"], [77, 3, 1, "", "scatter"], [77, 3, 1, "", "scatter_"], [77, 3, 1, "", "scatter_add"], [77, 3, 1, "", "scatter_add_"], [77, 3, 1, "", "scatter_reduce"], [77, 3, 1, "", "scatter_reduce_"], [77, 3, 1, "", "select"], [77, 3, 1, "", "select_scatter"], [77, 3, 1, "", "set_"], [77, 3, 1, "", "sgn"], [77, 3, 1, "", "sgn_"], [77, 2, 1, "", "shape"], [77, 3, 1, "", "share_memory_"], [77, 3, 1, "", "short"], [77, 3, 1, "", "sigmoid"], [77, 3, 1, "", "sigmoid_"], [77, 3, 1, "", "sign"], [77, 3, 1, "", "sign_"], [77, 3, 1, "", "signbit"], [77, 3, 1, "", "sin"], [77, 3, 1, "", "sin_"], [77, 3, 1, "", "sinc"], [77, 3, 1, "", "sinc_"], [77, 3, 1, "", "sinh"], [77, 3, 1, "", "sinh_"], [77, 3, 1, "", "size"], [77, 3, 1, "", "slice_scatter"], [77, 3, 1, "", "slogdet"], [77, 3, 1, "", "smm"], [77, 3, 1, "", "softmax"], [77, 3, 1, "", "sort"], [77, 3, 1, "", "sparse_dim"], [77, 3, 1, "", "sparse_mask"], [77, 3, 1, "", "sparse_resize_"], [77, 3, 1, "", "sparse_resize_and_clear_"], [77, 3, 1, "", "split"], [77, 3, 1, "", "sqrt"], [77, 3, 1, "", "sqrt_"], [77, 3, 1, "", "square"], [77, 3, 1, "", "square_"], [77, 3, 1, "", "squeeze"], [77, 3, 1, "", "squeeze_"], [77, 3, 1, "", "sspaddmm"], [77, 3, 1, "", "std"], [77, 3, 1, "", "stft"], [77, 3, 1, "", "storage"], [77, 3, 1, "", "storage_offset"], [77, 3, 1, "", "storage_type"], [77, 3, 1, "", "stride"], [77, 3, 1, "", "sub"], [77, 3, 1, "", "sub_"], [77, 3, 1, "", "subtract"], [77, 3, 1, "", "subtract_"], [77, 3, 1, "", "sum"], [77, 3, 1, "", "sum_to_size"], [77, 3, 1, "", "svd"], [77, 3, 1, "", "swapaxes"], [77, 3, 1, "", "swapaxes_"], [77, 3, 1, "", "swapdims"], [77, 3, 1, "", "swapdims_"], [77, 3, 1, "", "t"], [77, 3, 1, "", "t_"], [77, 3, 1, "", "take"], [77, 3, 1, "", "take_along_dim"], [77, 3, 1, "", "tan"], [77, 3, 1, "", "tan_"], [77, 3, 1, "", "tanh"], [77, 3, 1, "", "tanh_"], [77, 3, 1, "", "tensor_split"], [77, 3, 1, "", "tile"], [77, 3, 1, "", "to"], [77, 3, 1, "", "to_dense"], [77, 3, 1, "", "to_mkldnn"], [77, 3, 1, "", "to_padded_tensor"], [77, 3, 1, "", "to_sparse"], [77, 3, 1, "", "to_sparse_bsc"], [77, 3, 1, "", "to_sparse_bsr"], [77, 3, 1, "", "to_sparse_coo"], [77, 3, 1, "", "to_sparse_csc"], [77, 3, 1, "", "to_sparse_csr"], [77, 3, 1, "", "tolist"], [77, 3, 1, "", "topk"], [77, 3, 1, "", "trace"], [77, 3, 1, "", "transpose"], [77, 3, 1, "", "transpose_"], [77, 3, 1, "", "triangular_solve"], [77, 3, 1, "", "tril"], [77, 3, 1, "", "tril_"], [77, 3, 1, "", "triu"], [77, 3, 1, "", "triu_"], [77, 3, 1, "", "true_divide"], [77, 3, 1, "", "true_divide_"], [77, 3, 1, "", "trunc"], [77, 3, 1, "", "trunc_"], [77, 3, 1, "", "type"], [77, 3, 1, "", "type_as"], [77, 3, 1, "", "unbind"], [77, 3, 1, "", "unflatten"], [77, 3, 1, "", "unfold"], [77, 3, 1, "", "uniform_"], [77, 3, 1, "", "unique"], [77, 3, 1, "", "unique_consecutive"], [77, 3, 1, "", "unsafe_chunk"], [77, 3, 1, "", "unsafe_split"], [77, 3, 1, "", "unsqueeze"], [77, 3, 1, "", "unsqueeze_"], [77, 3, 1, "", "untyped_storage"], [77, 3, 1, "", "values"], [77, 3, 1, "", "var"], [77, 3, 1, "", "vdot"], [77, 3, 1, "", "view"], [77, 3, 1, "", "view_as"], [77, 3, 1, "", "vsplit"], [77, 3, 1, "", "where"], [77, 3, 1, "", "xlogy"], [77, 3, 1, "", "xlogy_"], [77, 3, 1, "", "xpu"], [77, 3, 1, "", "zero_"]], "unit_scaling.scale": [[80, 4, 1, "", "scale_bwd"], [81, 4, 1, "", "scale_fwd"]], "unit_scaling.transforms": [[84, 1, 1, "", "Metrics"], [85, 4, 1, "", "compile"], [86, 4, 1, "", "prune_non_float_tensors"], [87, 4, 1, "", "prune_same_scale_tensors"], [88, 4, 1, "", "prune_selected_nodes"], [89, 4, 1, "", "simulate_format"], [90, 4, 1, "", "simulate_fp8"], [91, 4, 1, "", "track_scales"], [92, 4, 1, "", "unit_scale"], [93, 0, 0, "-", "utils"]], "unit_scaling.transforms.Metrics": [[84, 1, 1, "", "Data"]], "unit_scaling.transforms.utils": [[94, 4, 1, "", "apply_transform"], [95, 4, 1, "", "patch_to_expand_modules"], [96, 4, 1, "", "replace_node_with_function"], [97, 4, 1, "", "torch_nn_modules_to_user_modules"]], "unit_scaling.utils": [[99, 1, 1, "", "ScalePair"], [100, 1, 1, "", "ScaleTracker"], [101, 1, 1, "", "ScaleTrackingInterpreter"], [102, 4, 1, "", "analyse_module"]], "unit_scaling.utils.ScaleTracker": [[100, 3, 1, "", "backward"], [100, 3, 1, "", "jvp"], [100, 3, 1, "", "mark_dirty"], [100, 3, 1, "", "mark_non_differentiable"], [100, 3, 1, "", "save_for_backward"], [100, 3, 1, "", "save_for_forward"], [100, 3, 1, "", "set_materialize_grads"], [100, 3, 1, "", "setup_context"], [100, 3, 1, "", "vjp"], [100, 3, 1, "", "vmap"]], "unit_scaling.utils.ScaleTrackingInterpreter": [[101, 3, 1, "", "boxed_run"], [101, 3, 1, "", "call_function"], [101, 3, 1, "", "call_method"], [101, 3, 1, "", "call_module"], [101, 3, 1, "", "fetch_args_kwargs_from_env"], [101, 3, 1, "", "fetch_attr"], [101, 3, 1, "", "get_attr"], [101, 3, 1, "", "map_nodes_to_values"], [101, 3, 1, "", "output"], [101, 3, 1, "", "placeholder"], [101, 3, 1, "", "run"], [101, 3, 1, "", "run_node"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "method", "Python method"], "4": ["py", "function", "Python function"], "5": ["py", "property", "Python property"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:attribute", "3": "py:method", "4": "py:function", "5": "py:property"}, "terms": {"": [5, 13, 30, 31, 32, 33, 52, 53, 54, 60, 65, 66, 67, 73, 77, 91, 100, 101, 102, 107], "0": [2, 3, 6, 7, 8, 9, 12, 16, 17, 18, 19, 36, 37, 39, 41, 45, 46, 47, 48, 49, 50, 52, 56, 57, 58, 60, 61, 62, 63, 65, 66, 67, 71, 76, 77, 82, 100, 102, 107], "00": 77, "000": 107, "0000": [7, 49, 77], "0000e": 77, "0005": 77, "001": [65, 66, 67], "0036": 77, "0047": 77, "00894": 107, "00902": [102, 107], "00904": 102, "01": [65, 66, 67, 77, 107], "0100": 77, "012766935862600803": 77, "018": [102, 107], "0182": 102, "0200": 77, "0202": 45, "0237": 49, "0251": 7, "03": 107, "0300": 77, "0309": 7, "0310": 77, "0310j": 77, "0334": 77, "0334j": 77, "0364": 7, "0411": 77, "044715": [8, 50], "04745": 19, "05": [9, 15, 51, 59, 77, 87, 107], "0528": 49, "0545e": 77, "0558": 77, "0584": 77, "0633": 77, "0633j": 77, "0635": 7, "0638": 77, "0736": 45, "0748": 7, "0779": 77, "0793": 77, "08": 77, "08909505605697632": 77, "0949e": 77, "0985": 45, "0994": 45, "1": [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 36, 39, 45, 46, 47, 49, 50, 54, 56, 57, 58, 60, 61, 62, 63, 65, 66, 67, 71, 77, 82, 87, 100, 102, 104, 107], "10": [7, 9, 14, 36, 45, 49, 74, 77, 102, 107], "100": [3, 36, 47, 77], "1000": [7, 36], "10000": 21, "1000000000": 77, "1007": 77, "103": [21, 24, 103], "1055": 77, "1057999968": 77, "106": 77, "1061112040": 77, "1064483442": 77, "1069546515": 77, "1071760287": 77, "1073668768": 77, "1082790149": 77, "1084397505": 77, "1084649136": 77, "1088634448": 77, "1089989247": 77, "1097310419": 77, "10_000": 21, "10mm": [65, 66, 67], "11": 77, "1101533110": 77, "1105482831": 77, "110mm": [65, 66, 67], "111": 77, "112": 77, "1123489973": 77, "1124191867": 77, "1151": 7, "1158": 77, "117": 77, "12": 77, "127": 77, "128": [10, 11, 60, 77], "13": 102, "13mm": [65, 66, 67], "14": 77, "141592": 77, "1416": 77, "1450": 45, "146": 77, "147": 77, "1472": 21, "15": 77, "152": 77, "1520": 77, "1535": 7, "154": 77, "15mm": 67, "16": [2, 6, 46, 77, 87, 107], "1610": 45, "1611": 77, "165": 77, "1655": 7, "1685": 7, "17": 45, "177": [77, 107], "18": [45, 77], "182": 77, "185": 77, "1867": 49, "1868": 45, "188": 77, "189": 77, "19": 45, "190": 77, "191": 77, "192": 77, "198": 107, "1d": [2, 3, 46], "1e": [9, 15, 51, 59, 65, 66, 67, 77, 107], "2": [2, 6, 7, 8, 9, 15, 16, 17, 37, 45, 49, 50, 52, 53, 54, 60, 65, 66, 67, 77, 87, 100, 102, 107], "20": [2, 5, 6, 9, 10, 11, 45, 46, 77], "2002": 19, "202": 77, "2022": 90, "2023": 104, "203": 77, "204": [102, 107], "205": 77, "206": 77, "21": 45, "2109": 77, "22": 77, "226": 77, "227": 77, "23": 77, "2300": 77, "2316": 77, "2326": 77, "235": [77, 102], "236": 102, "2397": 77, "240": 77, "2419": 49, "243": 77, "2438": 49, "246": 77, "25": 77, "252": 77, "253": 77, "254": 77, "2569": 77, "2678": 7, "27": 77, "28": 77, "288": 102, "289": 107, "2976": 77, "2_t": [65, 66], "2d": [3, 77], "2nd": 77, "3": [2, 3, 7, 8, 9, 17, 45, 47, 49, 50, 60, 65, 66, 67, 77, 104, 107], "30": [10, 11, 46, 77], "3000": 7, "31": 102, "310": 102, "3100": 77, "32": [60, 77], "3216": 45, "322": 107, "33": [2, 46], "3310": 77, "337": 102, "3385": 49, "3448": 7, "3497": 45, "3506": 45, "3553": 77, "3553j": 77, "36": 77, "3616": 7, "3618": 49, "3667": 45, "3672": 45, "3677": 7, "3743": 45, "38": 107, "3925": 45, "3930": 45, "3944": 45, "3e": 107, "3rd": 77, "4": [7, 13, 45, 49, 77, 100, 102, 107], "4004": 7, "4022": 45, "4090": 45, "41": 77, "4161": 49, "43": 77, "4362": 7, "44": 77, "45": 107, "4600": 77, "4711": 77, "4724": 77, "48": 107, "4842e": 77, "4926": 77, "4970": 7, "4999": 77, "4pt": [65, 66, 67], "5": [3, 5, 6, 7, 8, 9, 15, 36, 45, 46, 47, 48, 49, 50, 52, 60, 65, 66, 67, 77, 107], "50": [2, 107], "5044": 77, "51": 102, "5138": 77, "5158": 77, "52587890625e": 87, "53": 77, "5316": 77, "5316j": 77, "5384": 49, "54": 102, "5415473580360413": 77, "544": 76, "5445": 77, "5503": 77, "5609": 49, "5617": 77, "5765e": 77, "578": [102, 107], "5803": 7, "5811": 45, "59": 77, "5mm": [65, 66, 67], "6": [7, 45, 77, 102], "60": 107, "60521": 77, "6056": 45, "6108": 49, "6123": 77, "6123j": 77, "6147": 45, "6245": 45, "6246": 49, "6262": 49, "63": 77, "64": [5, 60, 107], "6431": 7, "6492": 77, "6550": 77, "6606": 77, "6753": 49, "6778": 7, "6902": 7, "693": 107, "6969": 7, "6971": 45, "6e": 107, "7": [45, 77], "706": 107, "707": 107, "7089": 7, "716": 107, "7172": 7, "72": 77, "7265": 7, "729": 107, "7383": 49, "7471": 49, "7472": 77, "7472j": 77, "7695": 45, "7724": 45, "7729271650314331": 77, "7761": 49, "7794": 49, "7845": 45, "7895": 7, "7896": 77, "7896j": 77, "8": [45, 60, 65, 66, 77, 107], "8017": 45, "8119": 77, "8119j": 77, "8182e": 77, "83": 107, "8303": 77, "84": 107, "8490": 49, "8499": 77, "8499j": 77, "8612": 49, "8649": 77, "8649j": 77, "867061780": 77, "87": 77, "8720": 49, "89": 77, "9": [7, 49, 65, 66, 67, 77], "9124": 7, "93": 77, "9315": 7, "9400": 7, "9417": 77, "9482": 77, "95": 77, "9607": 77, "9625": 49, "9666": 49, "9678": 77, "9685": 7, "9732": 45, "9751": 49, "979": 107, "9853": 77, "9897": 7, "9902": 45, "9913": 77, "9913j": 77, "999": [65, 66, 102, 107], "A": [3, 4, 5, 6, 7, 12, 13, 17, 18, 19, 47, 49, 54, 60, 63, 65, 66, 67, 77, 84, 85, 104, 107], "As": [77, 90, 107], "At": 2, "But": 77, "By": [3, 47, 77, 92, 95, 97, 107], "For": [0, 2, 7, 9, 24, 36, 39, 46, 54, 60, 65, 66, 67, 71, 77, 82, 91, 100, 103, 104, 107], "If": [2, 3, 6, 7, 9, 10, 11, 39, 46, 47, 48, 49, 52, 53, 54, 60, 63, 65, 66, 67, 73, 77, 82, 89, 90, 100, 101], "In": [2, 8, 10, 11, 16, 17, 23, 38, 45, 46, 50, 52, 53, 54, 58, 60, 61, 63, 73, 77, 92, 95, 100, 107], "It": [2, 3, 5, 24, 60, 63, 65, 66, 67, 77, 100, 103, 107], "Its": [65, 66, 67], "NOT": [65, 66, 67], "No": 100, "On": [2, 5, 10, 11, 54, 65, 66, 67], "One": 77, "Such": 76, "The": [0, 2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 22, 23, 27, 38, 39, 41, 45, 46, 47, 49, 50, 52, 53, 54, 58, 60, 61, 63, 65, 66, 67, 74, 77, 82, 85, 86, 87, 89, 90, 91, 92, 94, 100, 101, 104, 107], "There": [60, 77, 100, 107], "These": [77, 91, 107], "To": [60, 77, 97, 101, 104, 107], "Will": 101, "_": [2, 6, 60, 65, 66, 67, 100], "__": 67, "__init__": [60, 102, 107], "_c": [102, 107], "_channel": [2, 46], "_dim": 7, "_dynamo": [85, 94, 95], "_featur": [10, 11, 52, 53], "_freez": 7, "_i": 45, "_index": 3, "_j": 2, "_modul": 94, "_nn": [102, 107], "_nnz": 77, "_shape": 9, "_size": 2, "_t": 67, "_tau": 18, "_tensor": 77, "_weight": 7, "a_dual": 100, "ab": [19, 77], "abl": [77, 92, 107], "about": [77, 84], "abov": [5, 60, 77, 107], "abs_": 77, "abs_max": [23, 84], "abs_mean": [23, 84], "abs_min": [23, 84], "absolut": [41, 77], "absolute_": 77, "accept": [3, 5, 77, 100], "access": [65, 66, 67, 77, 91, 100], "accompani": 96, "accord": [60, 77, 92], "accordingli": 17, "accumul": 77, "aco": 77, "acos_": 77, "acosh": 77, "acosh_": 77, "across": 77, "act": 76, "activ": [9, 16, 61, 89, 107], "actual": [65, 66, 67, 77], "ad": [0, 2, 5, 9, 15, 60, 65, 66, 67, 77, 91, 105, 107], "adam": [64, 66, 69, 71, 104], "adam_lr_scale_func": 71, "adamw": [64, 69, 104], "adapt": [6, 107], "add": [2, 4, 56, 65, 66, 67, 77, 92, 104, 105, 107], "add_": 77, "add_param_group": [65, 66, 67], "addbmm": 77, "addbmm_": 77, "addcdiv": 77, "addcdiv_": 77, "addcmul": 77, "addcmul_": 77, "addit": [9, 10, 11, 17, 45, 52, 53, 65, 66, 67, 77, 91, 107], "addition": 77, "addmm": 77, "addmm_": 77, "addmv": 77, "addmv_": 77, "addr": 77, "addr_": 77, "address": [77, 85, 107], "adjoint": 77, "adjust": 17, "advanc": 34, "advantag": 107, "advis": 77, "affect": 77, "affin": [9, 77], "after": [4, 5, 27, 54, 65, 66, 67, 77, 92, 100], "against": 104, "agnost": 77, "aim": 107, "al": 90, "algorithm": [2, 8, 46, 60, 65, 66], "alia": 77, "alias": 25, "align": [3, 47, 60, 65, 66, 67, 77], "align_a": 77, "align_to": 77, "all": [2, 4, 7, 10, 11, 22, 23, 24, 39, 49, 60, 63, 65, 66, 67, 73, 77, 82, 86, 87, 88, 92, 100, 103, 107], "allclos": 77, "alloc": 77, "allow": [3, 5, 58, 77, 85, 107], "allow_non_unit_scaling_param": [65, 66, 67, 71], "alon": 92, "along": [17, 21, 63, 65, 66, 67, 77, 107], "alongsid": 107, "alpha": [36, 45, 77], "alreadi": [65, 66, 67, 77], "also": [0, 2, 3, 14, 16, 57, 61, 65, 66, 74, 75, 77, 91, 92, 100, 104, 107], "altern": [5, 39, 64, 82, 107], "although": [105, 107], "altogeth": [65, 66, 67], "alwai": [60, 77, 91, 100], "amax": 77, "amean": [2, 8, 10, 11, 16, 17, 38, 45, 46, 50, 52, 53, 54, 61, 63, 104], "amin": 77, "aminmax": 77, "amount": [2, 3, 47], "amsgrad": [65, 66], "an": [2, 3, 4, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 21, 22, 23, 24, 32, 38, 39, 46, 48, 49, 54, 58, 60, 65, 66, 67, 71, 73, 74, 77, 82, 86, 87, 88, 89, 91, 92, 99, 100, 101, 103, 104, 107], "analog": 77, "analys": [20, 102, 107], "analyse_modul": [104, 107], "analysi": [91, 92, 102, 103, 104, 107], "analyt": 49, "angl": 77, "ani": [2, 5, 6, 7, 8, 9, 10, 11, 16, 17, 23, 24, 32, 46, 52, 53, 60, 64, 65, 66, 67, 71, 77, 88, 91, 92, 94, 95, 96, 100, 101, 102, 103, 104, 105, 107], "annot": [14, 22, 74, 102], "annotated_cod": 107, "anoth": [7, 77], "answer": 107, "anticip": 92, "anyon": 105, "api": [77, 101, 104, 107], "appear": 77, "append": [4, 5, 18, 54, 77, 107], "appendix": [39, 82], "appli": [0, 2, 3, 5, 8, 9, 10, 11, 12, 15, 16, 17, 21, 22, 23, 27, 38, 45, 46, 47, 48, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 61, 62, 63, 64, 71, 77, 80, 81, 85, 86, 87, 92, 94, 95, 100, 107], "applic": [3, 47, 77, 100], "apply_": 77, "apply_constraint": 104, "apply_transform": 104, "approach": [92, 94, 106, 107], "appropri": [11, 71, 77], "approxim": [8, 16, 50, 61, 107], "ar": [0, 2, 3, 4, 5, 6, 9, 10, 11, 15, 17, 23, 24, 39, 47, 49, 54, 58, 60, 65, 66, 67, 73, 76, 77, 82, 84, 87, 89, 90, 91, 92, 94, 100, 101, 103, 106, 107], "arang": 77, "arbitrari": [7, 22, 23, 24, 49, 60, 77, 92, 99, 103, 107], "arbitrarili": 77, "arcco": 77, "arccos_": 77, "arccosh": 77, "arccosh_": 77, "architectur": [3, 47], "arcsin": 77, "arcsin_": 77, "arcsinh": 77, "arcsinh_": 77, "arctan": 77, "arctan2": 77, "arctan2_": 77, "arctan_": 77, "arctanh": 77, "arctanh_": 77, "arg": [3, 5, 24, 38, 47, 65, 66, 67, 75, 77, 96, 100, 101, 103], "argmax": 77, "argmin": 77, "args_list": 101, "argsort": 77, "argument": [2, 3, 8, 50, 54, 60, 65, 66, 67, 77, 100, 101, 107], "argwher": 77, "aris": 107, "arithmet": 26, "around": [0, 77, 92, 94, 105], "arrai": 77, "array_lik": 77, "arriv": 107, "arrow": 23, "art": 107, "arxiv": 19, "as_strid": 77, "as_strided_": 77, "as_strided_scatt": 77, "as_subclass": 77, "asin": 77, "asin_": 77, "asinh": 77, "asinh_": 77, "assert": [14, 60, 74], "assign": [3, 77], "assist": 105, "associ": [65, 66, 67, 91], "assum": [0, 30, 31, 32, 33, 107], "assumpt": 107, "async": 77, "asynchron": 77, "atan": 77, "atan2": 77, "atan2_": 77, "atan_": 77, "atanh": 77, "atanh_": 77, "atol": 77, "attach": 77, "attain": 107, "attempt": [60, 107], "attent": [12, 18, 19, 21, 39, 60, 82, 107], "attn": [39, 82], "attn_bia": 60, "attn_mask": [21, 24, 60, 103], "attn_weight": 60, "attribut": [65, 66, 67, 72, 76, 77, 100, 101], "autograd": [52, 53, 54, 65, 66, 67, 77, 100], "automat": [4, 5, 23, 24, 60, 77, 91, 92, 95, 100, 102, 103, 107], "autowrap_funct": 102, "autowrap_modul": 102, "avail": [60, 91, 107], "averag": [3, 39, 47, 65, 66, 82], "avoid": 77, "awar": 77, "aweight": 77, "ax": [23, 24, 103], "axi": [23, 77], "axis0": 77, "axis1": 77, "b": [45, 52, 53, 67, 77, 100], "back": 77, "backend": [2, 46, 60, 85, 94, 95], "backend_1": 94, "backend_2": 94, "backward": [2, 3, 10, 11, 22, 23, 24, 41, 47, 54, 58, 65, 66, 67, 77, 79, 80, 84, 86, 87, 89, 90, 91, 92, 99, 100, 101, 102, 103, 107], "baddbmm": 77, "baddbmm_": 77, "bar": [23, 24, 103], "base": [24, 52, 53, 60, 67, 71, 76, 100, 103, 107], "basic": [92, 107], "batch": [2, 3, 7, 9, 21, 24, 47, 49, 54, 65, 66, 67, 77, 103], "batch1": 77, "batch2": 77, "batch_siz": [21, 24, 100, 103, 107], "becaus": [77, 107], "becom": [2, 3, 5, 47, 77], "been": [22, 23, 27, 77, 86, 87, 92, 100, 107], "befor": [4, 63, 65, 66, 67, 77, 89, 90, 100, 101, 107], "begin": [3, 47, 65, 66, 67, 73, 77, 107], "behav": [58, 65, 66, 67, 77], "behavior": [54, 65, 66, 67, 77, 100], "behavour": 102, "being": [3, 7, 47, 58, 65, 66, 67, 77, 85, 94, 100, 101], "belong": 101, "below": [0, 9, 17, 47, 107], "benchmark": 107, "benefici": 58, "bernoulli": [6, 48, 77], "bernoulli_": 77, "best": [105, 107], "beta": [9, 52, 53, 54, 60, 65, 66, 77, 104, 107], "beta_1": [65, 66], "beta_2": [65, 66], "better": [3, 60, 92], "between": [2, 3, 5, 23, 36, 39, 46, 47, 58, 65, 66, 67, 77, 82, 107], "beyond": [3, 65, 66, 107], "bf": [65, 66, 67], "bf16": 107, "bfloat16": [60, 65, 66, 67, 77], "bia": [2, 9, 10, 11, 14, 46, 51, 52, 53, 60, 74, 102, 107], "bias": [9, 92], "big": [65, 66], "bin": 77, "binari": 77, "bincount": 77, "bit": [41, 77], "bitwise_and": 77, "bitwise_and_": 77, "bitwise_left_shift": 77, "bitwise_left_shift_": 77, "bitwise_not": 77, "bitwise_not_": 77, "bitwise_or": 77, "bitwise_or_": 77, "bitwise_right_shift": 77, "bitwise_right_shift_": 77, "bitwise_xor": 77, "bitwise_xor_": 77, "blend": 3, "block": [2, 77, 107], "blocksiz": 77, "bmm": 77, "bool": [2, 3, 6, 7, 9, 10, 11, 12, 15, 16, 19, 23, 24, 37, 47, 48, 49, 55, 60, 61, 65, 66, 67, 71, 77, 96, 100, 101, 102, 103], "boolean": [9, 15, 60, 77, 100], "booltensor": 77, "both": [2, 9, 39, 46, 54, 60, 77, 82, 100, 102, 107], "bound": 77, "box": [64, 101, 104, 107], "boxed_run": 101, "branch": [19, 56, 57, 58, 107], "breakdown": 107, "brief": 107, "broad": 104, "broadcast": [45, 54, 60, 77], "broadcast_to": 77, "bsc": 77, "bsr": 77, "buffer": 77, "bug": [104, 105, 107], "build": [92, 107], "built": 0, "bwd": [84, 102, 107], "bwd_format": 89, "bwd_scale": 107, "byte": 77, "c": [2, 3, 9, 45, 47, 60, 76, 77, 100], "c_": 2, "cach": [77, 85, 94], "calcul": [9, 39, 68, 69, 70, 82], "call": [5, 6, 22, 23, 24, 60, 65, 66, 67, 77, 85, 86, 87, 89, 90, 91, 92, 94, 95, 96, 97, 100, 101, 102, 103], "call_funct": 101, "call_method": 101, "call_modul": 101, "callabl": [18, 38, 39, 57, 65, 66, 67, 70, 71, 77, 82, 88, 92, 94, 95, 96, 101, 102, 107], "can": [0, 2, 3, 4, 5, 6, 7, 23, 46, 58, 60, 65, 66, 67, 76, 77, 85, 87, 91, 92, 100, 101, 104, 105, 107], "cannot": [60, 77], "captur": [65, 66, 95, 97], "care": 107, "carefulli": 77, "carri": 77, "cascad": 5, "case": [2, 3, 8, 10, 11, 16, 17, 24, 38, 45, 46, 47, 50, 52, 53, 54, 61, 63, 65, 66, 67, 73, 77, 87, 92, 95, 97, 103, 107], "cast": [63, 77], "cat": 77, "cauchi": 77, "cauchy_": 77, "caus": [4, 5, 77, 107], "causal": [12, 19, 60], "causalbia": 60, "ccol_indic": 77, "cdot": 3, "cdoubl": 77, "ceil": 77, "ceil_": 77, "center": 77, "certain": [2, 10, 11, 51, 54, 59, 65, 66, 67, 87, 101], "cfloat": 77, "chain": [5, 77, 91], "chalf": 77, "chang": [3, 4, 5, 7, 8, 12, 16, 17, 23, 47, 50, 60, 61, 62, 63, 65, 66, 67, 77, 85, 87, 107], "channel": [2, 6, 9, 77], "channels_last": 77, "char": 77, "charact": 77, "characterist": [65, 66, 67], "check": [71, 76, 77, 78, 100, 107], "checker": 76, "choleski": 77, "cholesky_invers": 77, "cholesky_solv": 77, "choos": 100, "chosen": [6, 30, 31, 32, 33, 60], "chrome": 77, "chunk": [21, 77], "circular": 2, "circumst": [2, 46, 60], "cl": 77, "clamp": 77, "clamp_": 77, "class": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 40, 41, 47, 49, 60, 63, 64, 65, 66, 67, 72, 73, 75, 76, 77, 83, 84, 92, 97, 98, 99, 100, 101, 102, 107], "classif": 3, "classmethod": 7, "clear": [73, 101], "clearer": 23, "clearli": 107, "click": 0, "clip": [77, 107], "clip_": 77, "clone": [77, 100, 104, 107], "close": 0, "closur": [65, 66, 67], "co": [6, 77], "coalesc": 77, "code": [0, 5, 24, 77, 102, 103, 106, 107], "codebas": [104, 107], "coeffici": [65, 66], "coexist": 77, "coin": [16, 61], "col_indic": 77, "colour": [23, 24, 103], "column": [49, 77], "com": 77, "combin": [24, 52, 53, 54, 56, 57, 77, 85, 103, 107], "come": [56, 77, 85], "common": [1, 25, 44, 45, 65, 66, 67, 105, 107], "commun": 77, "compar": [92, 104], "compat": [77, 101], "compil": [92, 94, 95, 104, 105, 107], "complet": [77, 88, 105], "complex": [2, 45, 46, 77, 101], "complex128": [2, 46, 77], "complex32": [2, 46, 77], "complex64": [2, 46, 77], "compon": 34, "compos": [46, 85], "comprehens": 107, "compress": 77, "compromis": 107, "comput": [3, 6, 7, 9, 17, 26, 28, 29, 37, 39, 47, 49, 55, 60, 63, 65, 66, 77, 82, 100, 107], "compute_uv": 77, "concaten": 2, "concern": 107, "concret": [77, 101], "conda": 102, "condit": 77, "configur": [4, 5], "conj": 77, "conj_phys": 77, "conj_physical_": 77, "conjug": 77, "conjunct": [56, 58, 91, 95, 107], "connect": [2, 5, 19, 56, 57, 58, 92, 107], "consecut": 77, "consid": [3, 77, 107], "consider": 104, "constant": 36, "constrain": [26, 28, 29, 92, 107], "constraint": [2, 8, 10, 11, 16, 17, 19, 38, 45, 46, 50, 52, 53, 54, 60, 61, 63, 67, 101, 104, 107], "constraint_nam": 27, "construct": [4, 5, 7, 14, 49, 74, 77], "constructor": 5, "consum": 77, "contain": [3, 4, 5, 7, 22, 47, 49, 65, 66, 67, 77, 84, 86, 89, 90, 91, 96, 99, 100, 107], "content": [65, 66, 67, 100], "context": [60, 65, 66, 67, 77, 100], "contigu": 77, "contiguous_format": 77, "continu": 77, "contrast": 107, "contribut": [3, 7, 39, 47, 49, 56, 57, 58, 82], "control": [2, 18, 60, 64, 107], "conv": 2, "conv1": 5, "conv1d": 104, "conv2": 5, "conv2d": 5, "conveni": [24, 94, 103], "convent": [77, 101], "converg": [65, 66], "convers": [77, 107], "convert": [22, 42, 77, 97, 107], "convolut": [2, 46], "convolv": [2, 46], "coo": 77, "coordin": 77, "copi": [65, 66, 67, 73, 77, 92], "copy_": 77, "copysign": 77, "copysign_": 77, "core": [18, 104], "corrcoef": 77, "correct": [77, 89, 90, 100, 107], "correctli": 77, "correl": 2, "correspond": [3, 7, 8, 12, 16, 17, 24, 27, 43, 47, 49, 50, 60, 61, 62, 63, 65, 66, 67, 73, 77, 92, 100, 103], "cos_": 77, "cosh": 77, "cosh_": 77, "cost": [2, 46, 60, 77], "could": [77, 107], "count": 77, "count_nonzero": 77, "counterintuit": 77, "cov": 77, "cover": 107, "coverag": 105, "cpu": 77, "cpython": 102, "creat": [5, 7, 71, 73, 77], "create_graph": 77, "creation": 77, "criterion": 3, "cross": [2, 3, 47, 77, 107], "cross_entropi": [104, 107], "crossentropyloss": [47, 104, 107], "crow_indic": 77, "csc": 77, "csr": 77, "ctx": 100, "cuda": [2, 46, 60, 65, 66, 67, 77], "cuda0": 77, "cudacachingalloc": 77, "cudnn": [2, 46, 60], "cummax": 77, "cummin": 77, "cumprod": 77, "cumprod_": 77, "cumsum": 77, "cumsum_": 77, "cumul": [8, 50], "current": [18, 24, 60, 65, 66, 67, 77, 85, 92, 94, 101, 103, 104, 105, 106, 107], "custom": [77, 92, 100], "cycl": 77, "d": [9, 52, 53, 73, 77, 100, 102, 107], "d_1": [3, 47], "d_2": [3, 47], "d_k": [3, 47], "dampen": 67, "data": [2, 9, 10, 11, 14, 21, 46, 52, 53, 63, 74, 77, 84, 101], "data_ptr": 77, "dataclass": 99, "datafram": 22, "dataload": 77, "dataset": [21, 24, 103], "dataset_nam": [21, 24, 103], "dataset_path": [21, 24, 103], "dealloc": [77, 101], "decai": [65, 66, 67, 71], "decid": 77, "decim": 77, "decod": 18, "decompos": 107, "decor": [76, 100], "decoupl": 66, "deem": [23, 24, 87, 103], "deep": [67, 107], "def": [60, 76, 77, 92, 100, 102, 107], "default": [2, 3, 6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 21, 23, 24, 38, 39, 45, 46, 47, 48, 49, 50, 52, 53, 54, 56, 57, 58, 60, 61, 63, 65, 66, 67, 71, 73, 77, 82, 87, 92, 94, 95, 96, 97, 100, 102, 103, 107], "defin": [17, 27, 60, 63, 64, 65, 66, 67, 76, 77, 92, 100, 107], "definit": [92, 104, 106], "deg2rad": 77, "deg2rad_": 77, "degrad": 107, "del": 77, "delai": [58, 77, 107], "demonstr": [18, 104], "denomin": [9, 15, 65, 66], "denot": [2, 23, 77, 107], "dens": 77, "dense_dim": 77, "densiti": 77, "depend": [3, 23, 47, 54, 60, 71, 77, 92, 101], "deprec": [3, 47, 77], "depth": [4, 5, 39, 68, 82], "depthmodulelist": 104, "depthsequenti": 104, "depthwis": 2, "dequant": 77, "deriv": [77, 107], "descend": [77, 101], "describ": [2, 3, 6, 9, 15, 39, 47, 77, 82, 106], "design": [104, 107], "desir": [63, 77, 107], "despit": 105, "destin": 77, "det": 77, "detach": 77, "detach_": 77, "detail": [2, 7, 8, 10, 11, 16, 17, 38, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 59, 61, 63, 65, 66, 76, 77, 92, 100, 101, 105, 107], "detector": 6, "determin": [21, 54, 107], "determinist": [2, 7, 46, 49, 60, 77], "develop": [98, 102, 107], "deviat": [9, 77, 99, 100, 101, 102, 107], "devic": [2, 7, 9, 10, 11, 46, 52, 53, 54, 60, 77, 101], "dfrac": 77, "diag": 77, "diag_emb": 77, "diagflat": 77, "diagon": [60, 77], "diagonal_scatt": 77, "dict": [65, 66, 67, 71, 73, 77, 92, 96, 101], "dictionari": [7, 65, 66, 67, 73, 77, 92], "dictionaryand": 49, "did": [65, 66, 67], "diff": 77, "differ": [2, 5, 10, 11, 24, 49, 54, 58, 60, 65, 66, 67, 77, 79, 85, 94, 103, 107], "differenti": [41, 65, 66, 67, 77, 100], "difficulti": 105, "digamma": 77, "digamma_": 77, "dilat": [2, 46], "dim": [3, 17, 37, 47, 60, 63, 77], "dim0": 77, "dim1": 77, "dim2": 77, "dim_ord": 77, "dimens": [3, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 46, 51, 52, 53, 54, 59, 63, 77, 100], "dimension": [3, 7, 9, 17, 47, 54, 77], "directli": [77, 100, 105, 107], "disabl": [60, 71], "discret": 77, "discuss": 107, "displai": 23, "dist": 77, "distribut": [3, 6, 8, 47, 48, 50, 77, 105, 107], "div": 77, "div_": 77, "divid": [47, 77], "divide_": 77, "divis": [2, 46, 77, 107], "divisor": 77, "do": [3, 6, 7, 48, 49, 65, 66, 67, 77, 100, 101, 106, 107], "doc": [0, 105], "document": [0, 7, 17, 107], "doe": [2, 3, 4, 5, 7, 47, 54, 60, 65, 66, 67, 71, 73, 77, 85, 94, 107], "doesn": [2, 46, 85, 92, 94, 95, 97, 100], "don": [23, 24, 65, 66, 67, 86, 87, 103, 107], "done": [77, 92], "dot": [3, 54, 60, 77], "doubl": [77, 100], "down": 107, "downcast": 77, "downstream": 64, "draw": 77, "drawn": 77, "drift": 107, "drop": [77, 107], "dropout": [12, 18, 19, 60, 104], "dropout_p": [12, 18, 19, 60], "dsplit": 77, "dtype": [2, 3, 7, 9, 10, 11, 47, 52, 53, 54, 60, 63, 77, 100, 101], "dual_level": 100, "duck": 76, "due": [60, 65, 66, 67, 77], "dummi": 102, "duplic": 77, "dure": [6, 7, 23, 24, 49, 60, 77, 100, 103, 107], "dw": 46, "dynamo": 83, "dynload": 102, "e": [2, 3, 7, 9, 46, 49, 54, 60, 73, 77, 85, 86, 87, 91, 100], "e4": [90, 107], "e5": [90, 107], "each": [2, 3, 5, 6, 7, 9, 10, 11, 23, 24, 46, 47, 49, 52, 60, 65, 66, 67, 77, 89, 90, 91, 100, 101, 102, 103, 107], "eager": 107, "earli": 77, "easi": [92, 94], "easier": 107, "easili": 0, "effect": [6, 77, 89, 90, 107], "effici": [60, 77, 100, 107], "effort": 105, "einop": 102, "either": [2, 3, 47, 73, 77, 100], "element": [2, 3, 6, 9, 15, 17, 38, 46, 47, 48, 50, 55, 60, 63, 73, 77], "element_s": 77, "elementwise_affin": [9, 15], "elimin": [77, 107], "ell": 3, "ellipsi": 77, "els": [60, 65, 66, 67, 73, 77], "embed": [9, 18, 39, 82, 104], "embedding_dim": [7, 9, 49], "embedding_matrix": 49, "emploi": 107, "empti": [3, 77], "enabl": [60, 67, 71, 77, 79, 92, 94, 100, 107], "enable_flash_sdp": 60, "enable_gqa": 60, "enable_grad": 77, "enable_io_process": 101, "enable_math_sdp": 60, "enable_mem_efficient_sdp": 60, "encount": [104, 107], "encourag": 105, "end": [3, 4, 5, 18, 47, 65, 66, 67, 73, 77, 107], "end_dim": 77, "engin": [77, 100], "enough": 107, "ensur": [39, 60, 77, 82, 100, 101, 107], "entir": [9, 77, 107], "entri": [7, 49, 65, 66, 67, 77], "entropi": [3, 47, 107], "env": 100, "environ": 101, "ep": [9, 15, 37, 51, 59, 65, 66], "epsilon": [9, 15, 37, 65, 66], "eq": 77, "eq_": 77, "equal": [30, 31, 32, 33, 39, 49, 56, 57, 58, 77, 82, 107], "equal_nan": 77, "equat": 106, "equival": [0, 2, 3, 7, 9, 60, 77, 89, 90, 92, 100, 102, 107], "erf": 77, "erf_": 77, "erfc": 77, "erfc_": 77, "erfinv": 77, "erfinv_": 77, "error": [8, 16, 23, 24, 50, 55, 60, 61, 77, 100, 101, 103], "especi": 77, "estim": [8, 9, 50], "et": 90, "etc": 3, "ev": 60, "evalu": [6, 9, 60, 101, 107], "even": [46, 54, 77, 107], "evenli": 77, "event": [60, 77], "everi": [6, 17, 39, 77, 82, 100, 101], "ex": [65, 66, 67], "exact": [77, 107], "exactli": [5, 107], "exampl": [2, 3, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 36, 45, 46, 47, 49, 54, 60, 65, 66, 67, 71, 76, 77, 100, 101, 102, 105, 107], "example_batch": [24, 103, 104], "except": 77, "execut": [77, 85, 94, 101], "exhaust": 105, "exist": [0, 73, 77, 92, 107], "exp": [3, 17, 63, 77], "exp2": 77, "exp2_": 77, "exp_": 77, "expand": [77, 100], "expand_a": 77, "expansion_factor": 13, "expect": [3, 9, 15, 49, 77, 89, 90, 107], "experi": [16, 61], "experiment": [24, 60, 77, 89, 90, 92, 103], "explicit": 77, "explicitli": 92, "explod": 107, "expm1": 77, "expm1_": 77, "expon": 77, "exponent_bit": [41, 42, 43], "exponenti": 77, "exponential_": 77, "export_chrome_trac": 77, "express": [77, 92, 96], "extend": [4, 72, 100], "extra": [75, 107], "extract": [7, 49, 101], "ey": 77, "f": [0, 38, 46, 47, 49, 60, 65, 66, 67, 73, 77, 107], "f_t": [65, 66, 67], "facilit": 104, "fact": [77, 107], "factor": [6, 13, 30, 31, 32, 33, 58, 60, 67, 68, 69, 70, 71, 79, 80, 81, 92, 107], "fail": [71, 77, 107], "faint": [23, 24, 103], "fals": [2, 3, 6, 7, 9, 10, 11, 15, 16, 23, 37, 47, 48, 49, 60, 61, 65, 66, 67, 71, 73, 77, 100, 102], "fan_in": 107, "fan_out": 107, "far": 107, "fashion": 77, "faster": 60, "favor": [56, 57, 58], "fc1": 102, "fc1_bia": 102, "fc1_weight": 102, "fc2": 102, "fc2_bia": 102, "fc2_weight": 102, "featur": [6, 52, 53, 54, 60, 104, 105, 107], "fed": [102, 107], "fetch": 101, "fetch_args_kwargs_from_env": 101, "fetch_attr": 101, "fewer": [65, 66, 67], "ffn": [39, 82], "field": [3, 47, 75, 77], "fifo": 73, "fill": 77, "fill_": 77, "fill_diagonal_": 77, "fill_uninitialized_memori": 77, "fill_valu": 77, "filter": [2, 46, 77], "final": [5, 11, 53, 54, 91, 92], "find": 107, "fine": [60, 65, 66, 67, 77], "finer": 64, "finish": 77, "fire": [65, 66, 67], "first": [0, 5, 7, 38, 54, 65, 66, 77, 85, 94, 100, 101, 107], "firstli": 107, "fit": 77, "five": 92, "fix": [7, 49, 77, 107], "fix_": 77, "flag": [65, 66, 67], "flash": 60, "flash_attent": 60, "flashattent": 60, "flatten": 77, "flatter": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63], "flexibl": 107, "flip": 77, "fliplr": 77, "flipud": 77, "float": [3, 6, 7, 8, 9, 12, 15, 16, 17, 18, 19, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 36, 37, 38, 39, 41, 45, 46, 47, 48, 49, 50, 51, 52, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 77, 80, 81, 82, 84, 86, 87, 91, 99, 101, 103, 107], "float16": [2, 10, 11, 54, 60, 65, 66, 67, 77], "float32": [65, 66, 67, 77], "float64": [60, 65, 66, 67, 77], "float_pow": 77, "float_power_": 77, "floattensor": [7, 77], "floor": 77, "floor_": 77, "floor_divid": 77, "floor_divide_": 77, "fmax": 77, "fmin": 77, "fmod": 77, "fmod_": 77, "fn": [39, 57, 82, 95], "focuss": 105, "follow": [0, 3, 24, 38, 54, 60, 65, 66, 67, 73, 77, 85, 91, 95, 100, 103, 104, 107], "footprint": [65, 66, 67], "foral": 77, "forc": 77, "foreach": [65, 66, 67], "fork": [104, 107], "form": [60, 107], "format": [77, 89, 90, 104, 107], "format_to_tupl": 104, "formul": 60, "formula": [67, 100], "fortun": 107, "forward": [5, 6, 22, 23, 24, 41, 60, 77, 79, 80, 81, 84, 85, 89, 90, 91, 92, 94, 95, 99, 100, 101, 102, 103, 107], "found": [73, 104, 107], "fp16": 107, "fp32": [89, 90, 107], "fp8": [90, 107], "fpformat": [42, 43, 89, 104], "frac": [2, 3, 6, 9, 10, 11, 15, 17, 46, 60, 63, 77], "frac_": 77, "free": 105, "freed": 77, "freez": 7, "frequenc": [7, 49], "frexp": 77, "fro": 77, "from": [0, 2, 4, 6, 7, 9, 10, 11, 17, 21, 23, 24, 48, 49, 65, 66, 67, 73, 77, 85, 86, 87, 89, 90, 91, 92, 94, 100, 101, 102, 103, 107], "from_numpi": 77, "from_pretrain": 7, "fromkei": 73, "front": 77, "frozen": [65, 66, 67], "fsdp": 77, "full": [0, 21, 46, 65, 66, 67, 77, 91, 100, 107], "fulli": 101, "func": [76, 100], "function": [0, 1, 2, 3, 5, 6, 8, 10, 11, 12, 16, 17, 18, 20, 22, 23, 25, 27, 34, 40, 64, 65, 66, 67, 72, 77, 79, 82, 83, 85, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 100, 101, 102, 104, 105, 107], "functionctx": 100, "fundament": 92, "further": [65, 66, 77, 107], "furthermor": [6, 77], "fuse": [60, 65, 66, 67, 107], "fusion": 107, "futur": [24, 77, 85, 100, 103], "fwad": 100, "fwd": [84, 107], "fwd_format": 89, "fwd_scale": 107, "fwd_tensor": 84, "fweight": 77, "fx": [22, 23, 85, 86, 87, 88, 91, 94, 95, 97, 101, 102, 107], "g": [22, 23, 39, 62, 65, 66, 77, 82, 86, 87, 91, 100], "g1": 100, "g2": 100, "g_": 67, "g_t": [65, 66, 67], "gamma": [9, 15, 65, 66, 67], "gate": [16, 61, 62], "gather": 77, "gaussian": [8, 16, 50, 61], "gc": 77, "gcd": 77, "gcd_": 77, "ge": 77, "ge_": 77, "gelu": [16, 61, 92, 104, 107], "gener": [3, 21, 22, 23, 24, 41, 65, 66, 67, 76, 77, 86, 87, 92, 102, 103, 107], "generate_vmap_rul": 100, "genproto": 76, "geometr": [28, 77, 107], "geometric_": 77, "geq": [3, 47], "geqrf": 77, "ger": 77, "get": [7, 71, 73, 77, 105, 107], "get_attr": 101, "get_devic": 77, "get_info": 77, "get_swap_module_params_on_convers": 77, "github": [77, 104, 107], "give": [2, 19, 77, 92, 104, 107], "given": [2, 3, 4, 5, 7, 18, 21, 39, 41, 43, 46, 47, 49, 60, 64, 73, 76, 77, 82, 86, 87, 88, 89, 90, 96, 100, 102, 107], "global": [60, 71, 85], "glorot": [29, 107], "gmean": [2, 8, 10, 11, 16, 17, 38, 45, 46, 50, 52, 53, 54, 61, 63, 104, 107], "gnu": 102, "go": [77, 100, 107], "good": 107, "got": 100, "gpu": 77, "gqa": 60, "grad": [32, 38, 46, 52, 65, 66, 67, 77, 80, 81, 100, 107], "grad_bias_scal": 107, "grad_fn": 77, "grad_input": 100, "grad_input_scal": [30, 32, 38, 107], "grad_out": 100, "grad_output": 100, "grad_weight_scal": 107, "gradient": [2, 3, 7, 8, 10, 11, 16, 17, 19, 30, 31, 32, 33, 38, 45, 46, 47, 49, 50, 52, 53, 54, 58, 61, 63, 65, 66, 67, 77, 84, 89, 92, 100, 107], "grain": 60, "graph": [22, 23, 24, 65, 66, 77, 86, 87, 88, 91, 92, 94, 95, 96, 97, 100, 101, 103], "graph_modul": 94, "graph_to_datafram": 104, "graphmodul": [85, 94, 101], "great": 107, "greater": [60, 77], "greater_": 77, "greater_equ": 77, "greater_equal_": 77, "greedili": 77, "ground": [3, 47], "group": [2, 26, 27, 28, 29, 46, 60, 64, 65, 66, 67, 71, 77], "grow": 107, "gt": 77, "gt_": 77, "guarante": [65, 66, 67, 77, 101, 105], "guid": [92, 104], "guidanc": 77, "gx": 100, "gy": 100, "gz": 100, "h": [7, 9, 60, 77], "h_": [10, 11], "ha": [2, 3, 6, 9, 15, 27, 46, 47, 54, 60, 65, 66, 67, 73, 77, 91, 92, 100, 106, 107], "half": [2, 60, 77], "hand": 5, "handl": [65, 66, 67, 77, 100, 101], "happen": [77, 100], "harder": 2, "hardshrink": 77, "harmon": 29, "has_nam": 77, "have": [2, 3, 7, 22, 23, 49, 52, 53, 54, 58, 65, 66, 67, 77, 86, 87, 89, 90, 91, 92, 100, 104, 105, 107], "haven": 105, "head": [12, 18, 19], "headroom": 107, "heavisid": 77, "heaviside_": 77, "height": 77, "help": [104, 105, 107], "henc": [77, 92], "here": [19, 60, 77, 92, 94, 105, 107], "hi": 107, "hidden": [12, 13, 18, 19], "hidden_s": [12, 13, 18, 19, 102], "hierarchi": 101, "high": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63, 77], "higher": [3, 60, 77], "histc": 77, "histogram": 77, "hmean": [2, 8, 10, 11, 16, 17, 38, 45, 46, 50, 52, 53, 54, 61, 63, 104], "hold": [65, 66, 67, 77], "home": 102, "hook": [65, 66, 67, 77, 100], "hop_length": 77, "horizont": [23, 24, 103], "host": 77, "how": [77, 100, 104], "howev": [2, 46, 65, 66, 67, 77, 92, 94, 107], "hq": 60, "hspace": [65, 66, 67], "hsplit": 77, "http": [19, 77], "huggingfac": [21, 24, 103], "hybrid": 77, "hyperparamet": [39, 82, 107], "hypot": 77, "hypot_": 77, "i": [0, 2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 21, 22, 23, 24, 27, 32, 39, 46, 47, 48, 49, 50, 52, 53, 54, 58, 60, 61, 62, 63, 65, 66, 67, 73, 74, 77, 82, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 100, 101, 103, 104, 105], "i0": 77, "i0_": 77, "icml": 104, "id": [21, 65, 66, 67], "ideal": 107, "ident": [6, 107], "identif": 92, "identifi": [60, 77, 86, 92], "identit": 77, "idx": [77, 100], "igamma": 77, "igamma_": 77, "igammac": 77, "igammac_": 77, "ignor": [3, 47, 76, 77, 107], "ignore_index": [3, 47], "ignore_w": [7, 49], "imag": [2, 3, 9, 77], "imaginari": 77, "img": 77, "immedi": 77, "impair": [65, 66, 67], "implement": [0, 6, 9, 12, 13, 15, 17, 18, 19, 24, 35, 60, 65, 66, 67, 77, 85, 92, 94, 102, 103, 104, 106, 107], "impli": 77, "implicit": [2, 46], "implicitli": 75, "import": [0, 23, 67, 85, 86, 87, 91, 92, 107], "improv": [6, 60, 65, 66, 67], "in_channel": 2, "in_dim": 100, "in_featur": [10, 11], "incept": [3, 47], "includ": [10, 11, 52, 53, 77, 105], "include_self": 77, "inclus": 77, "incom": [2, 10, 11, 52, 53], "incorrect": [19, 77, 100], "increas": [2, 13, 46, 60, 100], "incur": 107, "indend": 22, "independ": [6, 71, 77, 92], "independent_weight_decai": [65, 66, 67, 71], "index": [3, 4, 7, 39, 49, 77, 82, 100], "index_add": 77, "index_add_": [77, 100], "index_copi": 77, "index_copy_": 77, "index_fil": 77, "index_fill_": 77, "index_put": 77, "index_put_": 77, "index_reduce_": 77, "index_select": 77, "indic": [3, 7, 23, 24, 47, 49, 60, 65, 66, 67, 77, 100, 103], "indices_or_sect": 77, "individu": 77, "induc": 100, "inf": [17, 60, 77], "infer": [18, 77], "info": 100, "inform": [2, 22, 46, 60, 77, 84, 87, 91], "inherit": 0, "initi": [4, 5, 7, 9, 10, 11, 15, 39, 49, 65, 66, 67, 77, 82, 107], "initial_env": 101, "initialis": [92, 107], "inner": [77, 92], "innermost": 77, "inplac": [6, 16, 48, 61, 65, 66, 67, 100, 102], "inpt": [23, 86, 87, 91], "input": [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 22, 23, 24, 30, 31, 32, 33, 38, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 77, 80, 81, 87, 90, 91, 92, 100, 101, 102, 103, 107], "input2": 77, "input3": 77, "input_idx": 21, "input_tensor": 77, "insert": [4, 73, 89, 90], "insid": 100, "inspect": [89, 90], "instanc": [7, 9, 65, 66, 67, 77, 91], "instead": [3, 47, 65, 66, 67, 77], "instruct": [104, 107], "instrument": 101, "int": [2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 21, 24, 37, 39, 41, 42, 43, 45, 46, 47, 49, 51, 59, 63, 65, 66, 67, 74, 76, 77, 82, 84, 100, 101, 103, 107], "int16": 77, "int32": 77, "int64": [47, 77], "int8": 77, "int_repr": 77, "integ": [2, 9, 45, 77, 100], "integr": 77, "intend": [23, 65, 66, 67, 87, 91, 99, 100], "interfac": [24, 92, 94, 103], "intermedi": [13, 60, 65, 66, 67, 91, 101], "intermediari": 100, "intern": [22, 46, 89, 90, 91], "interoper": [92, 94], "interpol": [36, 77], "interpret": 101, "interv": 77, "introduc": [85, 107], "introduct": 107, "inttensor": [7, 77], "invalid": 77, "invers": [7, 49, 77], "investig": 107, "invoc": 101, "involv": 107, "ipu": 77, "is_caus": [12, 19, 60], "is_coalesc": 77, "is_complex": 77, "is_conj": 77, "is_contigu": 77, "is_cpu": 77, "is_cuda": 77, "is_floating_point": 77, "is_infer": 77, "is_ipu": 77, "is_leaf": 77, "is_meta": 77, "is_mp": 77, "is_neg": 77, "is_pin": 77, "is_quant": 77, "is_set_to": 77, "is_shar": 77, "is_sign": 77, "is_spars": 77, "is_sparse_csr": 77, "is_xla": 77, "is_xpu": 77, "isclos": 77, "isfinit": 77, "isinf": 77, "isn": 77, "isnan": 77, "isneginf": 77, "isposinf": 77, "isreal": 77, "issu": [105, 107], "issuecom": 77, "istft": 77, "item": [3, 73, 77], "items": 77, "iter": [4, 65, 66, 67, 71, 73, 77, 88, 94], "its": [2, 38, 54, 65, 66, 77, 81, 84, 85, 96, 100, 104, 107], "itself": [65, 66, 67, 77], "iw": 46, "j": [54, 77], "jit": 107, "jump": 77, "just": [18, 21, 65, 66, 67, 77, 100, 101, 107], "jvp": 100, "k": [2, 3, 10, 11, 47, 54, 73, 77], "keen": [104, 105, 107], "keep": [77, 107], "keep_type_expr": 96, "keepdim": [37, 77], "kei": [18, 60, 73, 77, 84, 91, 92, 104], "kept": 60, "kernel": [2, 46, 60, 77, 107], "kernel_s": 2, "keyerror": 73, "keyword": [24, 60, 77, 101, 103], "know": [77, 100], "known": [2, 16, 61, 105], "kron": 77, "kthvalu": 77, "kw": 46, "kwarg": [38, 65, 66, 67, 75, 77, 96, 100, 101], "l": [2, 3, 60], "l2": [65, 67], "l_": 2, "l_1": 3, "l_n": 3, "label": [3, 21, 24, 103], "label_smooth": [3, 47], "lack": [18, 73], "laid": 77, "lambd": 77, "lambda": [65, 66, 67, 77], "languag": 104, "larg": 107, "larger": [7, 49, 56, 57, 58, 77, 92, 107], "largest": 77, "last": [3, 5, 9, 10, 11, 51, 59, 73, 77, 85], "late": 77, "later": [16, 61, 77], "latest": 107, "layer": [2, 5, 9, 10, 11, 12, 13, 15, 18, 19, 39, 51, 58, 65, 66, 67, 82, 85, 92, 107], "layer_norm": [9, 104], "layernorm": [51, 104], "layout": [52, 53, 54, 77, 101], "lcm": 77, "lcm_": 77, "ldexp": 77, "ldexp_": 77, "ldot": [9, 65, 66, 67, 77], "le": 77, "le_": 77, "lead": 100, "leaf": 77, "leak": 100, "learn": [7, 9, 10, 11, 16, 61, 65, 66, 67, 71, 107], "learnabl": [2, 7, 9, 10, 11, 15], "least": [54, 77], "leav": [65, 66, 67, 77, 107], "left": [2, 31, 33, 54, 60, 77, 107], "left_grad_scal": [31, 33], "leftarrow": [65, 66, 67], "legaci": 106, "len": 77, "length": [2, 21, 24, 46, 77, 103], "lerp": 77, "lerp_": 77, "less": 77, "less_": 77, "less_equ": 77, "less_equal_": 77, "let": 77, "letter": 0, "level": [77, 107], "lfloor": 2, "lgamma": 77, "lgamma_": 77, "li": 17, "lib": 102, "librari": [34, 91, 92, 104, 105, 107], "lie": [17, 63], "life": 77, "lifetim": 77, "lifo": 73, "lift": 77, "like": [4, 5, 19, 65, 66, 67, 73, 77, 100, 107], "limit": [36, 60, 91, 104], "linalg": 77, "line": [23, 24, 103, 107], "linear": [8, 11, 16, 50, 53, 61, 62, 77, 89, 90, 102, 104, 107], "linear_1": [102, 107], "linear_1_bia": 107, "linear_1_weight": 107, "linear_2": 107, "linear_2_bia": 107, "linear_2_weight": 107, "linear_readout": 104, "linearreadout": [67, 104], "link": 2, "linux": 102, "list": [4, 5, 7, 9, 49, 65, 66, 67, 77, 94, 101, 105], "liter": [2, 10, 11, 14, 74, 77], "literatur": 90, "live": 77, "llama3": 60, "ln": 77, "load": [65, 66, 67, 77], "load_state_dict": [65, 66, 67, 77], "loaded_weight": 77, "local": [18, 102], "locat": 77, "log": [3, 77, 107], "log10": 77, "log10_": 77, "log1p": 77, "log1p_": 77, "log2": 77, "log2_": 77, "log_": 77, "log_normal_": 77, "logaddexp": 77, "logaddexp2": 77, "logarithm": 36, "logarithmic_interpol": 104, "logcumsumexp": 77, "logdet": 77, "logic": 54, "logical_and": 77, "logical_and_": 77, "logical_not": [60, 77], "logical_not_": 77, "logical_or": 77, "logical_or_": 77, "logical_xor": 77, "logical_xor_": 77, "logist": [16, 61, 62], "logit": [3, 47, 77], "logit_": 77, "logsoftmax": 3, "logsumexp": 77, "long": [3, 77], "longer": [0, 77], "longtensor": [7, 49, 77], "look": [7, 49, 54, 65, 66, 67, 77, 101, 107], "lookup": [7, 49], "loop": [65, 66, 67], "lose": 77, "loss": [3, 23, 24, 47, 65, 66, 67, 86, 87, 91, 103, 107], "loss_fn": 67, "low": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63, 77, 104, 107], "lower": [36, 46, 60, 65, 66, 67], "lr": [64, 65, 66, 67, 68, 69, 70, 71, 77], "lr_scale_for_depth": 104, "lr_scale_func": 71, "lr_scale_func_adam": 104, "lr_scale_func_sgd": [71, 104], "lt": 77, "lt_": 77, "lu": 77, "lu_data": 77, "lu_pivot": 77, "lu_solv": 77, "m": [2, 6, 8, 10, 11, 16, 17, 54, 85, 89, 90, 91, 92, 94], "m_": [65, 66], "m_0": [65, 66], "m_t": [65, 66], "made": [65, 66, 67, 100], "mai": [0, 2, 3, 24, 46, 52, 53, 54, 58, 60, 65, 66, 67, 77, 85, 87, 91, 92, 100, 103, 104, 105, 107], "main": 77, "maintain": 57, "make": [2, 23, 46, 60, 77, 92, 94, 97, 107], "make_du": 100, "manag": [60, 77], "mani": [77, 92, 100, 107], "manner": 77, "mantissa": 77, "mantissa_bit": [41, 42, 43], "manual": [3, 5, 47, 65, 66, 67, 77, 85, 92, 107], "map": [65, 66, 67, 77, 101], "map_": 77, "map_nodes_to_valu": 101, "mark": [77, 100], "mark_dirti": 100, "mark_non_differenti": 100, "mask": [12, 18, 19, 21, 60, 77], "masked_fil": 77, "masked_fill_": [60, 77], "masked_grad": 77, "masked_scatt": 77, "masked_scatter_": 77, "masked_select": 77, "mat": 77, "mat1": 77, "mat2": 77, "match": [60, 65, 66, 67, 77], "materi": 100, "math": [60, 102], "mathbb": 3, "mathcal": [2, 7, 10, 11], "mathrm": [9, 37, 65, 66], "matmul": [77, 89, 90, 104], "matplotlib": [23, 24, 103], "matric": 77, "matrix": [7, 49, 54, 60, 77], "matrix_exp": 77, "matrix_pow": 77, "matter": 100, "max": [23, 65, 66, 77, 107], "max_absolute_valu": 41, "max_norm": [7, 49], "maxim": [65, 66, 67, 104], "maximum": [23, 24, 41, 49, 77, 103], "maxnorm": 77, "mean": [3, 6, 8, 9, 10, 11, 15, 16, 17, 23, 26, 28, 29, 37, 47, 52, 53, 55, 77, 85, 107], "mean_ab": [23, 84], "meantim": [3, 47], "measur": [55, 107], "mechan": [60, 107], "median": 77, "meet": 107, "memori": [60, 65, 66, 67, 77, 100, 107], "memory_format": [77, 101], "mention": 77, "merit": 107, "met": 77, "meta": [77, 91], "metadata": [65, 66, 67, 75], "meth": 76, "method": [4, 5, 24, 65, 71, 73, 77, 85, 86, 87, 89, 90, 91, 94, 100, 101, 102, 103, 104, 107], "metric": [20, 22, 23, 91, 104], "mh": 77, "mhsa": 104, "mhsa_tau": 19, "micikeviciu": 90, "might": [65, 66, 67, 77, 107], "min": [23, 77, 107], "min_absolute_norm": 41, "min_absolute_subnorm": 41, "mind": 76, "mini": [7, 9, 49], "minibatch": [3, 46, 47], "minim": [65, 66, 67], "minimum": [23, 24, 41, 77, 103], "minlength": 77, "minu": 77, "mirror": [0, 77], "miss": [52, 53, 54, 105], "mixtur": [3, 47], "mkl": 77, "mkldnn": 77, "mlp": [19, 39, 82, 102, 104, 107], "mlp_tau": 19, "mm": [54, 77], "mod": 97, "mode": [2, 9, 46, 60, 77, 100], "model": [5, 20, 23, 24, 58, 65, 66, 67, 71, 86, 87, 89, 90, 91, 92, 98, 103, 104], "modestli": [65, 66, 67], "modif": [0, 4, 5, 77, 100], "modifi": [49, 65, 66, 67, 77, 100], "modul": [1, 2, 4, 5, 6, 7, 9, 10, 11, 15, 18, 22, 23, 24, 27, 34, 49, 54, 60, 77, 83, 85, 86, 87, 89, 90, 91, 92, 94, 95, 97, 101, 102, 103, 107], "module_load": 77, "modulelist": [4, 5], "moduletyp": 102, "moment": [65, 66], "momentum": 67, "momentum_buff": [65, 66, 67], "more": [2, 7, 46, 49, 60, 61, 63, 65, 66, 67, 77, 91, 100, 105, 107], "more_img": 77, "moreov": 77, "most": [60, 77, 100, 105, 107], "motiv": 107, "move": [73, 77, 85, 94, 106], "move_to_end": 73, "moveaxi": 77, "movedim": 77, "mp": 77, "mse_loss": 104, "mseloss": 55, "msort": 77, "mt": 77, "mtia": 77, "mu": [67, 77], "much": [77, 92, 107], "mul": 77, "mul_": 77, "mult": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63], "multi": [12, 19], "multinomi": 77, "multipl": [3, 47, 54, 77, 80, 81, 87, 107], "multipli": [2, 3, 8, 12, 16, 17, 47, 50, 54, 60, 61, 62, 63, 77, 107], "multiply_": 77, "mup": [64, 65, 66, 67, 71], "mup_scaling_depth": [14, 71, 74], "mup_typ": [14, 71, 74], "must": [2, 8, 10, 11, 16, 17, 27, 36, 38, 45, 46, 50, 52, 53, 54, 60, 61, 63, 77, 85, 86, 87, 100, 107], "mv": 77, "mvlgamma": 77, "mvlgamma_": 77, "mymodel": 60, "n": [2, 3, 7, 9, 17, 47, 54, 60, 77, 101], "n_fft": 77, "n_i": 2, "nabla_": [65, 66, 67], "naiv": 107, "name": [2, 8, 10, 11, 16, 17, 21, 24, 27, 38, 45, 46, 50, 52, 53, 54, 61, 63, 77, 101, 103], "named_img": 77, "named_tensor": 77, "nan": 77, "nan_to_num": 77, "nan_to_num_": 77, "nanmean": 77, "nanmedian": 77, "nanquantil": 77, "nansum": 77, "narrow": 77, "narrow_copi": 77, "natur": 60, "nbyte": 77, "ndarrai": 77, "ndim": 77, "ndimens": 77, "ne": 77, "ne_": 77, "nearli": 77, "necessari": [58, 77, 86, 100, 107], "necessarili": [3, 77], "need": [3, 46, 58, 77, 85, 91, 100, 102, 107], "needs_input_grad": 100, "neg": 77, "neg_": 77, "negative_": 77, "neginf": 77, "negligibli": 87, "neither": 100, "nelement": 77, "neq": [65, 67], "nest": [60, 77, 85, 92, 94], "nesterov": 67, "network": [6, 11, 16, 53, 61, 65, 66, 67], "neural": [6, 16, 61], "neuron": 6, "never": [77, 87], "nevertheless": 0, "new": [65, 66, 67, 73, 77, 85, 89, 90, 91, 94, 95, 96, 104, 105, 107], "new_arg": [65, 66, 67], "new_empti": 77, "new_empty_strid": 77, "new_ful": 77, "new_gelu": 92, "new_kwarg": [65, 66, 67], "new_on": 77, "new_tensor": 77, "new_zero": 77, "newli": 7, "next": [77, 107], "nextaft": 77, "nextafter_": 77, "nice": [2, 77], "nightli": 107, "nllloss": 3, "nlp": 9, "nn": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 22, 23, 24, 44, 49, 60, 65, 66, 67, 72, 74, 75, 77, 85, 89, 90, 92, 94, 95, 97, 100, 101, 102, 103, 107], "nnz": 77, "no_grad": [7, 65, 66, 67, 77], "node": [86, 87, 88, 91, 96, 101], "non": [3, 7, 12, 19, 40, 41, 47, 49, 54, 58, 60, 71, 77, 87, 100, 107], "non_block": 77, "non_recurse_funct": 94, "nondeterminist": [2, 46, 60, 77], "none": [2, 3, 4, 7, 8, 9, 10, 11, 14, 16, 17, 19, 23, 27, 37, 38, 45, 46, 47, 49, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 65, 66, 67, 70, 71, 73, 74, 77, 96, 97, 99, 100, 101, 102, 107], "nonlinear": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63], "nonzero": 77, "nonzero_stat": 77, "nor": 100, "norm": [2, 7, 10, 11, 14, 49, 74, 77], "norm_typ": [7, 49], "normal": [9, 15, 41, 51, 59, 77, 107], "normal_": 77, "normalis": [15, 58], "normalized_shap": [9, 15, 51, 59], "not_equ": 77, "not_equal_": 77, "note": [0, 2, 3, 4, 5, 7, 9, 10, 11, 15, 18, 46, 47, 49, 54, 60, 65, 66, 67, 77, 85, 91, 92, 94, 100, 106, 107], "notebook": [104, 107], "notic": [52, 53, 54], "noun": 90, "now": [24, 77, 91, 95, 100, 103, 107], "nse": 77, "num_channel": 77, "num_embed": 7, "num_sampl": 77, "number": [2, 3, 8, 10, 11, 12, 16, 17, 18, 19, 21, 32, 40, 41, 45, 46, 47, 49, 51, 52, 53, 59, 77, 107], "number_of_heads_kei": 60, "number_of_heads_key_valu": 60, "number_of_heads_queri": 60, "number_of_heads_valu": 60, "numel": [23, 77, 84], "numer": [9, 15, 23, 24, 49, 65, 66, 83, 89, 90, 103, 107], "numerical_accuraci": 60, "numpi": [77, 100], "object": [14, 65, 66, 67, 73, 74, 75, 77, 84, 91, 100], "observ": [3, 47, 77], "obtain": 77, "occasion": [104, 107], "occur": [65, 66, 67, 77, 107], "occurr": 77, "od": 73, "odd": 46, "offset": 77, "often": [7, 49, 77, 107], "old": [77, 107], "omit": 77, "onc": [88, 100], "once_differenti": 100, "one": [2, 8, 10, 11, 16, 17, 27, 38, 45, 46, 50, 52, 53, 54, 58, 60, 61, 63, 65, 66, 67, 77, 85, 100, 107], "ones": [7, 9, 15, 49, 60, 77], "onesid": 77, "onli": [3, 9, 24, 30, 31, 32, 33, 41, 47, 54, 60, 68, 76, 77, 80, 81, 85, 86, 89, 90, 91, 94, 100, 101, 103, 105, 107], "op": [30, 31, 32, 33, 65, 66, 67, 77, 85, 89, 90, 100, 101, 105, 107], "open": [52, 53, 54], "oper": [2, 6, 9, 15, 23, 24, 25, 46, 48, 52, 53, 54, 60, 63, 77, 79, 87, 92, 100, 102, 103, 105, 107], "opoverload": 101, "oppos": 100, "opt": 102, "optim": [3, 60, 85, 94, 95, 104], "optimis": [85, 94, 95, 104], "option": [2, 3, 7, 8, 9, 10, 11, 12, 16, 17, 18, 19, 21, 23, 24, 27, 38, 39, 45, 46, 49, 50, 52, 53, 54, 56, 57, 58, 60, 61, 62, 63, 65, 66, 67, 71, 77, 82, 84, 87, 92, 94, 96, 100, 101, 102, 103], "order": [5, 58, 60, 65, 66, 67, 73, 77, 101, 107], "ordereddict": 5, "ordin": 77, "org": 19, "orgqr": 77, "origin": [3, 16, 47, 61, 77, 81, 101], "ormqr": 77, "other": [2, 5, 20, 45, 46, 54, 60, 65, 66, 67, 77, 91, 105, 107], "otherwis": [3, 65, 66, 67, 73, 77, 100], "our": [65, 66, 92, 94, 100, 104, 105, 107], "out": [0, 2, 6, 10, 11, 45, 46, 52, 53, 54, 56, 64, 77, 100, 104, 107], "out_channel": 2, "out_dim": 100, "out_featur": [10, 11], "outer": 77, "outermost": 77, "outlin": [104, 107], "output": [2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 23, 24, 30, 31, 32, 33, 38, 45, 46, 47, 49, 50, 52, 53, 54, 60, 61, 62, 63, 74, 77, 86, 100, 101, 102, 103, 107], "output_s": 77, "output_scal": [30, 31, 32, 33, 38, 107], "outsid": 107, "over": [3, 5, 9, 15, 46, 47, 56, 57, 58, 60, 65, 66, 67, 77, 92, 100, 107], "overflow": [63, 107], "overhead": 107, "overlap": 77, "overload": 77, "overrid": [3, 47, 100], "overridden": [71, 100], "overview": [104, 107], "own": [2, 92, 94], "p": [6, 7, 14, 48, 49, 54, 60, 74, 77], "packag": 102, "pad": [2, 7, 12, 19, 46, 49, 77], "pad_mod": 77, "padding_idx": [7, 49], "padding_mod": 2, "padw": 46, "pair": [73, 77, 99], "panda": 22, "paper": [6, 9, 15, 39, 65, 66, 82, 92, 104, 107], "paradigm": 107, "parallel": 60, "param": [65, 66, 67, 68, 69, 71, 77], "param_group": [65, 66, 67, 71], "paramet": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 36, 38, 39, 45, 46, 47, 48, 49, 50, 52, 53, 54, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 71, 80, 81, 82, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 101, 102, 103, 104], "parameter": 77, "parameterdata": [14, 68, 69, 70, 71, 74, 78], "parametr": 104, "paramst": 71, "part": 60, "partial": 101, "particular": [54, 77, 92, 107], "particularli": [3, 107], "partit": 60, "pass": [5, 7, 22, 23, 24, 41, 58, 60, 65, 66, 67, 71, 76, 77, 79, 80, 81, 84, 89, 90, 91, 92, 95, 99, 100, 101, 102, 103, 107], "patch": 95, "patch_to_expand_modul": 104, "path": [21, 24, 103], "pd": 22, "pdf": 77, "peak": [65, 66, 67], "penalti": [65, 67], "pep": 76, "per": [3, 9, 15, 47, 65, 66, 67, 71, 77, 100], "perform": [2, 3, 5, 46, 60, 63, 65, 66, 67, 71, 77, 100, 107], "permut": 77, "perspect": [23, 24, 103, 107], "phi": [8, 50], "physic": 77, "pi": [8, 50, 77], "pick": 77, "pin": 77, "pin_memori": 77, "pinvers": 77, "pip": [104, 107], "pivot": 77, "pixel": 3, "place": [6, 48, 49, 77, 100, 107], "placehold": 101, "plain": 102, "plane": [2, 9, 46], "pleas": [52, 53, 54, 60, 65, 66, 77, 100, 106, 107], "plot": [22, 24, 91, 103, 104], "plot_kwarg": [24, 103], "point": [2, 3, 23, 24, 41, 49, 60, 77, 86, 87, 91, 103, 107], "pointer": 77, "poll": 77, "polygamma": 77, "polygamma_": 77, "pop": 73, "popitem": 73, "popul": [77, 101], "posinf": 77, "posit": [2, 3, 18, 77, 101], "position": 77, "possibl": [0, 49, 77, 107], "possibli": [22, 23], "post": [12, 18, 19, 65, 66, 67, 77, 106], "potenti": [2, 46, 60, 77], "pow": 77, "pow_": 77, "power": [46, 52], "practic": [23, 77, 91, 95, 107], "pre": [65, 66, 67, 101, 107], "precis": [2, 10, 11, 54, 60, 104, 107], "predict": [47, 77], "prefer": 60, "prenorm": 19, "prepend": [54, 65, 66, 67, 77], "preprocess": 77, "presenc": 76, "present": [73, 77, 96, 104, 107], "preserv": [77, 107], "preserve_format": 77, "pretrain": 7, "pretrainedtokenizerbas": [21, 24, 103], "prevent": [6, 63, 77, 100], "preview": 107, "previou": [77, 87, 92], "primarili": 76, "print": [10, 11, 102, 107], "prior": [58, 60, 100], "prioriti": 92, "probabl": [3, 6, 12, 18, 19, 47, 48, 60, 77], "problem": [3, 104, 107], "proce": 92, "procedur": 92, "process": [3, 7, 47, 65, 66, 67, 86, 92, 101], "process_input": 101, "process_output": 101, "prod": 77, "produc": [2, 77, 107], "product": [54, 60, 77], "profil": 77, "program": [77, 107], "progress": [65, 66, 67, 77], "prohibit": [65, 66, 67], "promot": 45, "promptli": 101, "proof": 92, "propag": 77, "properli": 4, "properti": [41, 77], "proportion": 77, "proto": 76, "protocol": [14, 74, 75, 78], "proven": 6, "provid": [3, 5, 24, 26, 28, 29, 30, 31, 32, 33, 60, 64, 65, 66, 67, 73, 77, 87, 91, 92, 103, 105, 107], "prune": [23, 24, 86, 87, 88, 103], "prune_non_float_tensor": [91, 104], "prune_same_scal": 23, "prune_same_scale_tensor": [91, 104], "prune_selected_nod": 104, "pruned_graph": [86, 87], "pull": 77, "purpos": [54, 77, 89, 90, 107], "push": 107, "put": [77, 107], "put_": 77, "py": 102, "python": [4, 77, 102], "python3": 102, "pytorch": [0, 20, 60, 77, 85, 92, 107], "q": 77, "q_per_channel_axi": 77, "q_per_channel_scal": 77, "q_per_channel_zero_point": 77, "q_scale": 77, "q_zero_point": 77, "qr": 77, "qscheme": 77, "qtensor": 77, "quad": 3, "qualifi": 101, "quantil": 77, "quantis": [41, 89, 90], "quantise_bwd": 41, "quantise_fwd": 41, "quantiz": 77, "queri": 60, "queu": 77, "r": [7, 49, 77, 100], "race": 77, "rad2deg": 77, "rad2deg_": 77, "rais": [27, 60, 73, 77, 100], "rand": [49, 60, 77], "randint": [47, 77], "randn": [2, 3, 6, 8, 9, 10, 11, 16, 17, 45, 46, 47, 77, 102, 107], "random": [2, 21, 46, 60, 77, 100], "random_": [3, 77], "rang": [3, 17, 24, 63, 77, 85, 89, 90, 92, 101, 103, 107], "rank": 77, "rate": [65, 66, 67, 71], "rather": 85, "ratio": [36, 39, 56, 57, 58, 77, 82, 107], "ravel": 77, "re": [24, 63, 77, 103, 104, 105, 107], "read": [77, 107], "readout_constraint": [67, 70], "real": 77, "realli": 101, "realloc": 77, "reason": [60, 91, 105], "receiv": [65, 66, 67, 107], "recent": 107, "reciproc": 77, "reciprocal_": 77, "recogn": 76, "recommend": [24, 28, 77, 103, 104, 107], "recomput": 100, "record": [77, 91, 100, 101], "record_stream": 77, "rectifi": 107, "recurs": [92, 94, 95, 97, 101, 102], "recurse_modul": 102, "reduc": [3, 47, 55, 77, 89, 90], "reduct": [3, 47, 55, 77], "reevalu": [65, 66, 67], "refer": [65, 66, 77, 104, 106, 107], "referenc": 101, "refin": 77, "refine_nam": 77, "reflect": [2, 77, 102, 106, 107], "regard": [7, 49, 65, 66, 107], "regist": [4, 5, 65, 66, 67, 77], "register_hook": 77, "register_load_state_dict_post_hook": [65, 66, 67], "register_load_state_dict_pre_hook": [65, 66, 67], "register_post_accumulate_grad_hook": 77, "register_state_dict_post_hook": [65, 66, 67], "register_state_dict_pre_hook": [65, 66, 67], "register_step_post_hook": [65, 66, 67], "register_step_pre_hook": [65, 66, 67], "regular": [4, 6, 66, 71, 92], "reinforc": [16, 61], "reinterpret": 77, "rel": [13, 19, 39, 56, 58, 77, 82, 87], "releas": [77, 85, 104, 107], "relev": [0, 9, 23, 24, 71, 103], "reli": [77, 92], "relu": [5, 102], "relu1": 5, "relu2": 5, "remain": [7, 49], "remaind": 77, "remainder_": 77, "remap": 77, "rememb": 73, "remov": [54, 65, 66, 67, 73, 77, 87, 96, 107], "removablehandl": [65, 66, 67], "removeablehandl": [65, 66, 67], "renam": 77, "rename_": 77, "rename_map": 77, "renamed_img": 77, "renorm": [7, 49, 77], "renorm_": 77, "repeat": [77, 107], "repeat_interleav": [60, 77], "replac": [0, 77, 92, 96, 107], "replace_node_with_funct": 104, "replic": 2, "repo": [104, 107], "report": 101, "repres": [23, 24, 77, 84, 88, 91, 92, 94, 96, 99, 100, 101, 102, 103, 107], "represent": [41, 77, 89, 90], "request": [52, 53, 54, 65, 66, 67, 105], "requir": [2, 3, 60, 77, 91, 92, 100, 107], "requires_grad": [3, 7, 47, 77, 100], "requires_grad_": [77, 91, 102, 107], "rescal": [3, 17, 47], "reset": [65, 66, 67], "reshap": [23, 77, 87], "reshape_a": 77, "resid": 77, "residu": [18, 19, 39, 56, 57, 58, 82, 92, 107], "residual_add": [57, 58, 104, 107], "residual_appli": 104, "residual_attn_ratio": [39, 82], "residual_mult": [39, 82], "residual_sc": 18, "residual_split": [56, 57, 104, 107], "residuallay": 107, "resiz": 77, "resize_": 77, "resize_as_": 77, "resolve_conj": 77, "resolve_neg": 77, "respect": [49, 65, 66, 67, 77, 100], "rest": 77, "restrict": [3, 54], "result": [22, 23, 56, 58, 77, 91, 101], "retain": 96, "retain_grad": 77, "retain_graph": 77, "retains_grad": 77, "rethink": [3, 47], "retriev": [7, 27, 49, 77, 100, 101], "return": [3, 5, 14, 17, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 36, 38, 39, 43, 47, 54, 56, 58, 60, 62, 63, 65, 66, 67, 71, 73, 74, 76, 77, 80, 81, 82, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 100, 101, 102, 103, 107], "return_complex": 77, "return_count": 77, "return_invers": 77, "reus": 77, "revers": 77, "rfloor": 2, "right": [2, 31, 33, 54], "right_grad_scal": [31, 33], "rm": [15, 59, 104], "rms_norm": 104, "rmsnorm": [59, 104], "rocm": [2, 10, 11, 54], "roll": 77, "root": 15, "rot90": 77, "roughli": 107, "round": [41, 77, 89, 90], "round_": 77, "rounding_mod": 77, "routin": 77, "row": [23, 24, 49, 77, 103], "row_indic": 77, "rsqrt": 77, "rsqrt_": 77, "rtol": [77, 87], "rule": [39, 60, 64, 65, 66, 67, 77, 82], "run": [5, 22, 23, 60, 65, 66, 67, 77, 89, 90, 101, 104, 107], "run_nod": 101, "runtim": 76, "runtime_check": 76, "runtimeerror": [77, 100], "s0": 77, "s1": 77, "safe": [65, 66, 77], "safeti": 77, "sai": 77, "said": 92, "sake": [4, 5, 83, 91], "same": [2, 3, 5, 6, 8, 9, 10, 11, 16, 17, 39, 46, 47, 54, 60, 62, 77, 82, 87, 89, 90, 100, 107], "sampl": [2, 3, 6, 7, 10, 11, 21, 47, 48, 49, 60, 77], "satisfi": 77, "save": [65, 66, 67, 77, 100], "save_for_backward": 100, "save_for_forward": 100, "saved_tensor": 100, "saved_tensors_hook": 100, "saved_weight": 77, "scalar": [3, 9, 54, 77, 80, 81, 99, 107], "scale": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 77, 82, 83, 85, 86, 87, 89, 90, 91, 92, 98, 102, 103, 105], "scale_bwd": 104, "scale_channel": 77, "scale_elementwis": 104, "scale_factor": 60, "scale_fwd": 104, "scale_grad_by_freq": [7, 49], "scale_pow": [46, 52], "scaled_dot_product_attent": [89, 90, 104], "scaled_paramet": [64, 104], "scaledmlp": 107, "scalepair": [100, 104], "scales_graph": [22, 23, 24, 86, 87, 91, 103], "scaletrack": 104, "scaletrackinginterpret": 104, "scatter": 77, "scatter_": 77, "scatter_add": 77, "scatter_add_": 77, "scatter_reduc": 77, "scatter_reduce_": 77, "scenario": 107, "scheme": [18, 39, 77, 82], "scope": 107, "score": 60, "scratch": 107, "script": 107, "sdpa_kernel": 60, "sdpbackend": 60, "second": [5, 7, 54, 65, 66, 100], "secondli": 107, "section": [47, 77, 104, 107], "see": [2, 3, 7, 8, 10, 11, 16, 17, 18, 19, 38, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 76, 77, 92, 100, 101, 104, 105, 106, 107], "seed": 21, "seen": [23, 24, 77, 103, 107], "select": [2, 30, 31, 32, 33, 46, 60, 77], "select_scatt": 77, "self": [4, 12, 16, 19, 60, 61, 65, 66, 67, 76, 77, 101, 102, 107], "semant": [77, 101], "sens": 107, "sentence_length": 9, "separ": 107, "seq": 77, "seq_len": [21, 24, 103], "sequenc": [2, 5, 12, 19, 21, 24, 51, 77, 103], "sequenti": [5, 18], "set": [2, 3, 6, 9, 10, 11, 15, 39, 46, 47, 48, 60, 65, 66, 67, 71, 73, 77, 82, 84, 88, 91, 92, 100, 107], "set_": 77, "set_materialize_grad": 100, "set_to_non": [65, 66, 67], "setdefault": 73, "setup_context": 100, "sever": 46, "sgd": [64, 70, 77, 104], "sgn": 77, "sgn_": 77, "shallow": [65, 66, 67, 73], "shape": [2, 3, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 45, 46, 47, 49, 50, 52, 53, 60, 61, 62, 63, 77, 100], "share": [77, 100, 107], "share_memory_": 77, "sharper": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63], "shift": [21, 77], "short": 77, "shorthand": 77, "should": [3, 17, 38, 46, 47, 56, 58, 60, 65, 66, 67, 75, 77, 89, 90, 91, 92, 94, 95, 100, 101, 102, 107], "show": [23, 104, 107], "show_arrow": 23, "show_error_bar": 23, "show_zero_tensor": 23, "shown": [9, 23], "shrink": 107, "shuffl": 21, "shuffle_buffer_s": 21, "side": [2, 46, 77], "sigma": [16, 61, 62, 77], "sigmoid": [16, 61, 62, 77], "sigmoid_": 77, "sign": [77, 107], "sign_": 77, "signal": [2, 46], "signatur": [65, 66, 67, 76, 77, 100], "signbit": 77, "signific": [87, 107], "significantli": [65, 66, 67, 107], "silu": [62, 104], "silu_glu": 104, "similar": [77, 100, 107], "simpl": [76, 77], "simplefunc": 100, "simplest": 2, "simpli": [6, 107], "simul": [40, 77, 89, 90], "simulate_format": 104, "simulate_fp8": [85, 91, 92, 104, 105], "sin": 77, "sin_": 77, "sinc": [65, 66, 67, 77], "sinc_": 77, "singl": [2, 3, 5, 9, 46, 57, 65, 66, 67, 77, 85], "singleton": [9, 77], "sinh": 77, "sinh_": 77, "site": 102, "situat": 77, "size": [2, 3, 7, 9, 10, 11, 12, 13, 18, 19, 21, 24, 47, 49, 60, 77, 100, 103], "size_averag": [3, 47, 55], "sizedim": 77, "sizeof": [65, 66, 67], "skip": [19, 56, 57, 58, 65, 66, 67, 100, 107], "slice": [17, 63, 77, 87, 101], "slice_scatt": 77, "slide": 104, "slightli": [85, 87], "slogdet": 77, "small": [5, 107], "smaller": [77, 107], "smm": 77, "smooth": [3, 47], "so": [2, 17, 46, 60, 63, 65, 66, 67, 77, 85, 91, 92, 94, 100, 101, 102, 107], "softmax": [3, 12, 18, 19, 47, 60, 77, 104, 107], "solv": 107, "some": [0, 2, 3, 46, 47, 52, 53, 54, 60, 65, 66, 67, 77, 92, 104, 107], "some_comm_op": 77, "someth": [65, 66, 67, 107], "sort": [77, 100], "sound": 5, "sourc": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 36, 37, 38, 39, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 78, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 99, 100, 101, 102, 103], "sp": 104, "space": [2, 36, 46], "span": [3, 77], "spars": [7, 17, 49, 52, 53, 54, 77], "sparse_bsc": 77, "sparse_bsr": 77, "sparse_coo": 77, "sparse_coo_tensor": 77, "sparse_csc": 77, "sparse_csr": 77, "sparse_dim": 77, "sparse_mask": 77, "sparse_resize_": 77, "sparse_resize_and_clear_": 77, "sparsecsr": 77, "sparsedim": 77, "spatial": 9, "special": [77, 92, 105], "specif": [9, 60, 65, 66, 67, 77, 89, 90, 101], "specifi": [3, 7, 47, 49, 60, 63, 65, 66, 67, 73, 77, 87, 91, 100, 107], "speed": 107, "speedup": [89, 90, 107], "split": [46, 58, 77], "split_siz": 77, "split_size_or_sect": 77, "sqrt": [2, 8, 9, 10, 11, 15, 37, 50, 60, 65, 66, 77], "sqrt_": 77, "squar": [15, 55, 60, 65, 66, 77], "square_": 77, "squeez": 77, "squeeze_": 77, "srbit": 41, "src": 77, "sspaddmm": 77, "stabil": [9, 15, 65, 66], "stack": [39, 82, 107], "stage": 92, "stai": [77, 107], "standard": [0, 9, 17, 24, 40, 77, 85, 90, 92, 94, 99, 100, 101, 102, 103, 104, 107], "star": 2, "start": [39, 77, 82, 101, 107], "start_dim": 77, "state": [65, 66, 67, 77], "state_dict": [65, 66, 67], "static": [76, 100], "static_s": 77, "staticmethod": 100, "statist": 9, "std": [23, 77, 84], "steadili": 107, "step": [65, 66, 67, 71, 77, 107], "stft": 77, "still": [58, 77, 85, 92, 100, 105], "stochast": [41, 65, 89, 90], "storag": [77, 100], "storage_offset": 77, "storage_typ": 77, "store": [5, 7, 77, 101], "str": [2, 3, 8, 10, 11, 16, 17, 21, 23, 24, 27, 38, 41, 45, 46, 47, 50, 52, 53, 54, 55, 61, 63, 65, 66, 67, 70, 71, 77, 88, 101, 102, 103], "stream": 77, "strictli": 77, "stride": [2, 46, 77], "string": [2, 46, 77, 102], "strongli": 107, "structur": [76, 100, 101], "sub": 77, "sub_": 77, "subclass": [77, 100], "subject": [60, 77], "submodul": 5, "subnorm": 41, "subsequ": [2, 5], "subspac": 77, "substanti": 107, "substitut": 92, "subtract": 77, "subtract_": 77, "subtyp": 76, "success": 77, "successfulli": 85, "suffici": [77, 92, 107], "suitabl": 77, "sum": [3, 15, 17, 47, 63, 77], "sum_": [2, 3], "sum_j": [17, 63], "sum_to_s": 77, "super": [60, 102, 107], "suppli": [2, 86, 87, 89, 92, 94, 100, 107], "support": [0, 2, 3, 6, 7, 10, 11, 14, 24, 45, 46, 47, 48, 49, 52, 53, 54, 60, 65, 66, 67, 74, 75, 77, 78, 85, 92, 94, 100, 103], "sure": 60, "suspect": 105, "svd": 77, "sw": 46, "swap": [0, 77, 107], "swap_tensor": 77, "swapax": 77, "swapaxes_": 77, "swapdim": 77, "swapdims_": 77, "swiglu": 13, "swish": [16, 61], "switch": [65, 66, 67], "symbool": 101, "symfloat": 101, "symint": 101, "sync": 77, "synchron": 77, "syntax_highlight": 102, "system": [92, 94], "t": [2, 7, 23, 24, 43, 46, 49, 52, 53, 65, 66, 67, 76, 77, 85, 86, 87, 92, 94, 95, 97, 100, 103, 105, 107], "t_": 77, "tabl": [7, 49], "tag": [71, 75], "take": [38, 60, 77, 92, 107], "take_along_dim": 77, "taken": [3, 107], "tall": 77, "tan": 77, "tan_": 77, "tanh": [8, 50, 77], "tanh_": 77, "target": [3, 47, 55, 67, 77, 88, 96, 101, 107], "target_fn": 96, "target_to_funct": 101, "tau": [39, 56, 57, 58, 67, 82, 107], "techniqu": [6, 107], "tell": 77, "temp_mask": 60, "temperatur": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63], "templat": [24, 103], "tend": [91, 107], "tensor": [2, 3, 7, 10, 11, 14, 15, 17, 21, 23, 24, 37, 38, 41, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 71, 74, 80, 81, 84, 86, 87, 91, 94, 99, 100, 101, 102, 103, 107], "tensor1": 77, "tensor2": 77, "tensor_split": 77, "tensorfloat32": [2, 10, 11, 46, 52, 53, 54], "tensorlist": [65, 66, 67], "term": [65, 66, 77, 107], "test": [92, 105, 107], "text": [2, 3, 7, 8, 9, 10, 11, 16, 17, 21, 45, 46, 47, 50, 61, 62, 63, 65, 66, 67, 77], "textbf": [65, 66, 67], "textit": [65, 66, 67], "texttt": 77, "th": 77, "than": [2, 7, 46, 49, 58, 60, 65, 66, 67, 77, 85, 91, 92, 101], "thei": [0, 5, 23, 24, 65, 66, 67, 76, 77, 92, 100, 103, 104, 107], "them": [7, 17, 63, 77, 92, 100, 101, 105], "themselv": 107, "theori": [77, 107], "thereaft": [85, 94], "therefor": [7, 49, 77], "theta": [65, 66, 67], "theta_": [65, 66, 67], "theta_0": [65, 66, 67], "theta_t": [65, 66, 67], "thi": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 21, 22, 23, 24, 27, 38, 39, 41, 45, 46, 48, 49, 50, 52, 53, 54, 58, 60, 61, 63, 65, 66, 67, 71, 73, 75, 77, 82, 85, 86, 87, 89, 90, 91, 92, 94, 95, 97, 100, 101, 103, 104, 105, 106, 107], "thin": 0, "think": 77, "thorough": 107, "those": [0, 3, 47, 77, 89, 90, 104, 107], "though": [54, 58, 77, 85, 94, 105, 107], "three": [9, 31, 33, 60], "through": [65, 66, 67, 77, 100, 101, 105, 107], "throw": 77, "thrown": [60, 77], "thu": [54, 107], "tile": 77, "time": [2, 9, 45, 54, 65, 66, 67, 77, 107], "titl": 23, "to_dens": 77, "to_grad_input_scal": [2, 8, 10, 11, 16, 17, 38, 46, 50, 52, 53, 61, 63, 104], "to_left_grad_scal": [45, 54, 104], "to_mkldnn": 77, "to_output_scal": [2, 8, 10, 11, 16, 17, 38, 45, 46, 50, 52, 53, 54, 61, 63, 104], "to_padded_tensor": 77, "to_right_grad_scal": [45, 54, 104], "to_spars": 77, "to_sparse_bsc": 77, "to_sparse_bsr": 77, "to_sparse_coo": 77, "to_sparse_csc": 77, "to_sparse_csr": 77, "togeth": [56, 107], "toggl": 102, "token": [18, 21, 24, 103], "told": 92, "toler": 87, "tolist": 77, "too": [3, 60, 77, 89, 90], "tool": [20, 107], "top": [3, 92], "topk": 77, "torch": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 22, 23, 24, 44, 45, 46, 47, 49, 54, 60, 63, 65, 66, 67, 69, 70, 71, 72, 74, 75, 77, 83, 85, 89, 90, 91, 92, 94, 95, 97, 100, 102, 103, 105, 107], "torch_doctest_autograd": 100, "torch_nn_modules_to_user_modul": [95, 104], "torchdynamo": [89, 90, 92, 94, 95, 97], "torchscript": [77, 107], "total": [39, 77, 82], "touch": 105, "trace": [77, 92], "tracer": 101, "track": [4, 5, 23, 24, 77, 86, 91, 103], "track_scal": [22, 23, 24, 86, 87, 103, 104], "tracked_model": [24, 103], "tradeoff": 77, "trail": 15, "train": [3, 6, 7, 9, 48, 49, 58, 60, 65, 66, 67, 104, 105, 107], "trainabl": [65, 66, 67], "transform": [5, 9, 10, 11, 18, 19, 22, 23, 24, 38, 39, 52, 53, 64, 65, 66, 67, 77, 82, 103, 104, 105, 107], "transformer_residual_scaling_rul": [18, 104], "transformerdecod": 104, "transformerlay": 104, "transpos": [60, 77], "transpose_": 77, "treat": [5, 9, 17, 77], "tri": [65, 66, 67, 77], "trial": 77, "triangular": 60, "triangular_solv": 77, "trick": 107, "tricki": 107, "trigger": 77, "tril": [60, 77], "tril_": 77, "trip": 107, "triu": 77, "triu_": 77, "trivial_subclass": 97, "trou": 2, "true": [2, 3, 6, 7, 9, 10, 11, 15, 23, 24, 46, 47, 48, 49, 60, 65, 66, 67, 71, 73, 77, 89, 90, 96, 100, 101, 102, 103], "true_divid": 77, "true_divide_": 77, "trunc": 77, "trunc_": 77, "truncat": 77, "trunk": 18, "truth": [3, 47], "try": [2, 46, 60, 65, 66, 67, 107], "tune": [65, 66, 67, 107], "tupl": [2, 15, 21, 27, 37, 42, 43, 46, 52, 58, 59, 65, 66, 67, 77, 96, 100, 101, 102, 107], "tuple_to_format": 104, "tutori": [100, 107], "twice": 77, "two": [2, 3, 30, 47, 54, 58, 60, 65, 66, 67, 77, 85, 100, 107], "type": [2, 7, 10, 11, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 36, 38, 39, 45, 46, 49, 56, 58, 60, 62, 63, 65, 66, 67, 71, 76, 77, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 101, 102, 103], "type_a": 77, "typedstorag": 77, "typeguard": 78, "typic": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63, 77, 95, 97, 107], "u": [0, 2, 10, 11, 14, 39, 64, 65, 66, 67, 71, 72, 74, 75, 82, 92, 102, 104, 106, 107], "uint8": 77, "uint8_t": 77, "ultim": 92, "unaffect": 77, "unbalanc": 3, "unbatch": 3, "unbias": 9, "unbind": 77, "unchang": [77, 80], "unclear": 77, "uncoalesc": 77, "unconstrain": 92, "undefin": [77, 100], "under": 49, "underflow": 107, "underli": [60, 77, 85], "underneath": 100, "underscor": 77, "understand": 107, "undesir": [2, 46, 60], "unexpectedli": 77, "unflatten": 77, "unfold": 77, "ungraph": [65, 66], "uniform": [3, 47, 77], "uniform_": 77, "uniniti": 77, "union": 102, "uniqu": 77, "unique_consecut": 77, "unit": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 25, 28, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 71, 83, 85, 89, 90, 91, 92, 98, 102, 105], "unit_sc": [0, 104, 105, 107], "unit_scal": [85, 91, 104, 105], "unit_scaled_funct": 107, "unitriangular": 77, "unitscaledmodul": 107, "unless": [77, 100], "unlik": [9, 23, 24, 77, 103, 107], "unment": 77, "unnam": 77, "unnorm": [3, 47], "unreduc": 3, "unsafe_chunk": 77, "unsafe_split": 77, "unscal": 107, "unscaledmlp": 107, "unspecifi": [17, 65, 66, 67, 77], "unsqueez": 77, "unsqueeze_": 77, "until": [77, 107], "untyped_storag": 77, "untypedstorag": 77, "up": [7, 49, 77, 100, 101, 107], "updat": [7, 49, 71, 73, 77, 104, 107], "upgrad": 107, "upper": [36, 60, 77], "us": [0, 2, 3, 5, 7, 8, 9, 10, 11, 13, 19, 21, 22, 24, 25, 27, 29, 41, 46, 48, 49, 54, 56, 57, 58, 60, 63, 65, 66, 67, 76, 77, 83, 84, 85, 87, 89, 90, 91, 92, 94, 95, 97, 100, 101, 102, 103, 104, 105, 107], "usag": [18, 34, 77], "use_deterministic_algorithm": 77, "user": [24, 60, 65, 66, 67, 77, 89, 90, 91, 92, 94, 97, 100, 103, 104], "usual": [65, 66, 67, 77, 107], "util": [65, 66, 67, 77, 104, 107], "uu": [0, 14, 71, 74, 107], "v": [49, 65, 66, 67, 73, 77], "v1": [21, 24, 103], "v_": [65, 66], "v_0": [65, 66], "v_t": [65, 66], "valid": [2, 46, 54, 77, 92, 107], "valu": [2, 3, 5, 7, 9, 10, 11, 15, 17, 23, 24, 36, 41, 46, 47, 56, 57, 58, 60, 65, 66, 67, 71, 73, 77, 86, 89, 90, 92, 100, 101, 103, 107], "valueerror": 27, "var": [9, 77], "variabl": [77, 100], "varianc": [39, 82], "variant": [65, 66], "variou": [89, 91], "vdot": 77, "ve": 105, "vec": 77, "vec1": 77, "vec2": 77, "vector": [7, 49, 54, 77], "veri": 107, "verif": [65, 66, 67], "versa": 77, "version": [1, 24, 38, 44, 54, 65, 66, 67, 77, 85, 91, 92, 94, 95, 97, 103, 107], "versu": 60, "via": [9, 77, 86, 87, 91, 92, 101, 107], "vice": 77, "video": [77, 104], "view": [23, 73, 77], "view_a": 77, "visibl": 4, "vision": [3, 47], "visual": 2, "visualis": [21, 23, 91, 104], "vjp": 100, "vmap": 100, "vocab_s": 18, "vocabulari": 18, "vsplit": 77, "w": [3, 7, 9, 49, 77, 100], "w_": 3, "w_c": 3, "wa": [16, 61, 77, 101], "wai": [5, 77, 85, 92, 100, 107], "wait": 77, "wait_stream": 77, "want": [77, 105], "warn": [19, 24, 60, 103], "we": [2, 24, 65, 66, 67, 77, 87, 90, 92, 94, 100, 101, 103, 104, 105, 107], "weight": [2, 3, 7, 9, 10, 11, 14, 15, 16, 18, 19, 36, 46, 47, 49, 51, 52, 53, 56, 57, 58, 59, 60, 61, 65, 66, 67, 71, 74, 77, 89, 92, 102, 107], "weight_decai": [65, 66, 67, 71], "weight_mup_typ": [2, 10, 11], "welcom": 104, "well": [3, 65, 66, 67, 77, 105, 107], "were": [77, 100], "weren": 100, "what": [2, 5, 60, 65, 66, 67, 86, 100, 104], "whatev": [60, 77], "when": [2, 3, 5, 8, 9, 10, 11, 15, 17, 46, 47, 50, 54, 60, 65, 66, 67, 77, 89, 90, 91, 95, 97, 101, 107], "where": [2, 3, 7, 8, 9, 10, 11, 16, 17, 47, 49, 50, 52, 53, 54, 60, 61, 62, 65, 66, 67, 77, 89, 91, 92, 101, 107], "wherea": 77, "whether": [65, 66, 67, 77, 92, 100], "which": [0, 3, 5, 9, 13, 17, 24, 58, 60, 63, 65, 66, 67, 71, 77, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 100, 101, 103, 107], "while": [3, 8, 12, 16, 17, 47, 50, 60, 61, 62, 63, 77, 100], "who": [104, 105, 107], "whole": 5, "whose": [77, 102], "why": [60, 107], "wide": [24, 92, 103], "widehat": [65, 66], "width": 77, "wikitext": [21, 24, 103], "win_length": 77, "window": 77, "wise": [38, 50, 55], "wish": [94, 104, 107], "within": [20, 77, 85, 94, 101, 107], "without": [65, 66, 67, 77, 85, 102, 105, 107], "won": 77, "word": [2, 7, 49], "work": [24, 60, 77, 85, 93, 103, 104, 105, 107], "would": [77, 100], "wouldn": 77, "wrap": [77, 95, 101, 102, 107], "wrapper": [0, 64, 92, 94], "write": 77, "written": 85, "wrong": 100, "wrt": 77, "x": [3, 8, 9, 15, 16, 23, 37, 41, 50, 61, 62, 76, 77, 92, 100, 102, 107], "x86_64": 102, "x_": [3, 17, 63], "x_i": [17, 63], "x_j": [17, 63], "x_npy": 100, "x_t": 100, "xa": [52, 53], "xavier": [29, 107], "xdoctest": [7, 49, 67, 100], "xla": 77, "xlogi": 77, "xlogy_": 77, "xmax": 23, "xmin": 23, "xpu": 77, "y": [3, 9, 15, 52, 53, 77, 100], "y_": 3, "y_n": 3, "y_t": 100, "yet": [65, 66, 77, 92, 104, 107], "you": [2, 3, 46, 52, 53, 54, 60, 65, 66, 67, 77, 100, 101, 105, 107], "your": [77, 100, 107], "z": [77, 100], "zero": [2, 6, 7, 9, 14, 48, 49, 60, 65, 66, 67, 74, 77, 92, 100], "zero_": [49, 77], "zero_grad": [65, 66, 67], "zero_point": 77, "zeros_lik": 100, "zip": [65, 66, 67], "\u00b5p": 104, "\u00e0": 2, "\u03bcp": [14, 39, 72, 74, 75, 82, 104, 106]}, "titles": ["3. API reference", "3.1. unit_scaling", "3.1.4. unit_scaling.Conv1d", "3.1.5. unit_scaling.CrossEntropyLoss", "3.1.6. unit_scaling.DepthModuleList", "3.1.7. unit_scaling.DepthSequential", "3.1.8. unit_scaling.Dropout", "3.1.9. unit_scaling.Embedding", "3.1.10. unit_scaling.GELU", "3.1.11. unit_scaling.LayerNorm", "3.1.12. unit_scaling.Linear", "3.1.13. unit_scaling.LinearReadout", "3.1.14. unit_scaling.MHSA", "3.1.15. unit_scaling.MLP", "3.1.1. unit_scaling.Parameter", "3.1.16. unit_scaling.RMSNorm", "3.1.17. unit_scaling.SiLU", "3.1.18. unit_scaling.Softmax", "3.1.19. unit_scaling.TransformerDecoder", "3.1.20. unit_scaling.TransformerLayer", "3.2. unit_scaling.analysis", "3.2.1. unit_scaling.analysis.example_batch", "3.2.2. unit_scaling.analysis.graph_to_dataframe", "3.2.3. unit_scaling.analysis.plot", "3.2.4. unit_scaling.analysis.visualiser", "3.3. unit_scaling.constraints", "3.3.1. unit_scaling.constraints.amean", "3.3.2. unit_scaling.constraints.apply_constraint", "3.3.3. unit_scaling.constraints.gmean", "3.3.4. unit_scaling.constraints.hmean", "3.3.5. unit_scaling.constraints.to_grad_input_scale", "3.3.6. unit_scaling.constraints.to_left_grad_scale", "3.3.7. unit_scaling.constraints.to_output_scale", "3.3.8. unit_scaling.constraints.to_right_grad_scale", "3.1.21. unit_scaling.core", "3.1.21.1. unit_scaling.core.functional", "3.1.21.1.1. unit_scaling.core.functional.logarithmic_interpolation", "3.1.21.1.2. unit_scaling.core.functional.rms", "3.1.21.1.3. unit_scaling.core.functional.scale_elementwise", "3.1.21.1.4. unit_scaling.core.functional.transformer_residual_scaling_rule", "3.4. unit_scaling.formats", "3.4.3. unit_scaling.formats.FPFormat", "3.4.1. unit_scaling.formats.format_to_tuple", "3.4.2. unit_scaling.formats.tuple_to_format", "3.1.22. unit_scaling.functional", "3.1.22.1. unit_scaling.functional.add", "3.1.22.2. unit_scaling.functional.conv1d", "3.1.22.3. unit_scaling.functional.cross_entropy", "3.1.22.4. unit_scaling.functional.dropout", "3.1.22.5. unit_scaling.functional.embedding", "3.1.22.6. unit_scaling.functional.gelu", "3.1.22.7. unit_scaling.functional.layer_norm", "3.1.22.8. unit_scaling.functional.linear", "3.1.22.9. unit_scaling.functional.linear_readout", "3.1.22.10. unit_scaling.functional.matmul", "3.1.22.11. unit_scaling.functional.mse_loss", "3.1.22.12. unit_scaling.functional.residual_add", "3.1.22.13. unit_scaling.functional.residual_apply", "3.1.22.14. unit_scaling.functional.residual_split", "3.1.22.15. unit_scaling.functional.rms_norm", "3.1.22.16. unit_scaling.functional.scaled_dot_product_attention", "3.1.22.17. unit_scaling.functional.silu", "3.1.22.18. unit_scaling.functional.silu_glu", "3.1.22.19. unit_scaling.functional.softmax", "3.1.23. unit_scaling.optim", "3.1.23.5. unit_scaling.optim.Adam", "3.1.23.6. unit_scaling.optim.AdamW", "3.1.23.7. unit_scaling.optim.SGD", "3.1.23.1. unit_scaling.optim.lr_scale_for_depth", "3.1.23.2. unit_scaling.optim.lr_scale_func_adam", "3.1.23.3. unit_scaling.optim.lr_scale_func_sgd", "3.1.23.4. unit_scaling.optim.scaled_parameters", "3.1.24. unit_scaling.parameter", "3.1.24.3. unit_scaling.parameter.OrderedDict", "3.1.24.1. unit_scaling.parameter.Parameter", "3.1.24.4. unit_scaling.parameter.ParameterData", "3.1.24.5. unit_scaling.parameter.Protocol", "3.1.24.6. unit_scaling.parameter.Tensor", "3.1.24.2. unit_scaling.parameter.has_parameter_data", "3.5. unit_scaling.scale", "3.5.1. unit_scaling.scale.scale_bwd", "3.5.2. unit_scaling.scale.scale_fwd", "3.1.2. unit_scaling.transformer_residual_scaling_rule", "3.6. unit_scaling.transforms", "3.6.9. unit_scaling.transforms.Metrics", "3.6.1. unit_scaling.transforms.compile", "3.6.2. unit_scaling.transforms.prune_non_float_tensors", "3.6.3. unit_scaling.transforms.prune_same_scale_tensors", "3.6.4. unit_scaling.transforms.prune_selected_nodes", "3.6.5. unit_scaling.transforms.simulate_format", "3.6.6. unit_scaling.transforms.simulate_fp8", "3.6.7. unit_scaling.transforms.track_scales", "3.6.8. unit_scaling.transforms.unit_scale", "3.7. unit_scaling.transforms.utils", "3.7.1. unit_scaling.transforms.utils.apply_transform", "3.7.2. unit_scaling.transforms.utils.patch_to_expand_modules", "3.7.3. unit_scaling.transforms.utils.replace_node_with_function", "3.7.4. unit_scaling.transforms.utils.torch_nn_modules_to_user_modules", "3.8. unit_scaling.utils", "3.8.2. unit_scaling.utils.ScalePair", "3.8.3. unit_scaling.utils.ScaleTracker", "3.8.4. unit_scaling.utils.ScaleTrackingInterpreter", "3.8.1. unit_scaling.utils.analyse_module", "3.1.3. unit_scaling.visualiser", "Unit Scaling", "2. Limitations", "Almost-scaled dot-product attention", "1. User guide"], "titleterms": {"adam": 65, "adamw": 66, "add": 45, "almost": 106, "amean": 26, "analyse_modul": 102, "analysi": [20, 21, 22, 23, 24], "api": 0, "apply_constraint": 27, "apply_transform": 94, "attent": 106, "compil": 85, "consider": 107, "constraint": [25, 26, 27, 28, 29, 30, 31, 32, 33], "content": 104, "conv1d": [2, 46], "core": [34, 35, 36, 37, 38, 39], "cross_entropi": 47, "crossentropyloss": 3, "depthmodulelist": 4, "depthsequenti": 5, "develop": 104, "dot": 106, "dropout": [6, 48], "embed": [7, 49], "example_batch": 21, "format": [40, 41, 42, 43], "format_to_tupl": 42, "fpformat": 41, "function": [35, 36, 37, 38, 39, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63], "gelu": [8, 50], "get": 104, "gmean": 28, "graph_to_datafram": 22, "guid": 107, "has_parameter_data": 78, "hmean": 29, "how": 107, "i": 107, "instal": [104, 107], "kei": 107, "layer_norm": 51, "layernorm": 9, "limit": 105, "linear": [10, 52], "linear_readout": 53, "linearreadout": 11, "logarithmic_interpol": 36, "lr_scale_for_depth": 68, "lr_scale_func_adam": 69, "lr_scale_func_sgd": 70, "matmul": 54, "metric": 84, "mhsa": 12, "mlp": 13, "model": 107, "mse_loss": 55, "optim": [64, 65, 66, 67, 68, 69, 70, 71], "optimis": 107, "ordereddict": 73, "paramet": [14, 72, 73, 74, 75, 76, 77, 78], "parameterdata": 75, "patch_to_expand_modul": 95, "plot": 23, "product": 106, "protocol": 76, "prune_non_float_tensor": 86, "prune_same_scale_tensor": 87, "prune_selected_nod": 88, "refer": 0, "replace_node_with_funct": 96, "residual_add": 56, "residual_appli": 57, "residual_split": 58, "rm": 37, "rms_norm": 59, "rmsnorm": 15, "scale": [79, 80, 81, 104, 106, 107], "scale_bwd": 80, "scale_elementwis": 38, "scale_fwd": 81, "scaled_dot_product_attent": 60, "scaled_paramet": 71, "scalepair": 99, "scaletrack": 100, "scaletrackinginterpret": 101, "sgd": 67, "silu": [16, 61], "silu_glu": 62, "simulate_format": 89, "simulate_fp8": 90, "softmax": [17, 63], "start": 104, "tensor": 77, "to_grad_input_scal": 30, "to_left_grad_scal": 31, "to_output_scal": 32, "to_right_grad_scal": 33, "torch_nn_modules_to_user_modul": 97, "track_scal": 91, "transform": [83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97], "transformer_residual_scaling_rul": [39, 82], "transformerdecod": 18, "transformerlay": 19, "tuple_to_format": 43, "unit": [104, 107], "unit_sc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103], "unit_scal": 92, "user": 107, "util": [93, 94, 95, 96, 97, 98, 99, 100, 101, 102], "visualis": [24, 103], "what": 107}}) \ No newline at end of file diff --git a/user_guide.html b/user_guide.html index 3129a3e..0f980c7 100644 --- a/user_guide.html +++ b/user_guide.html @@ -97,7 +97,7 @@

1. User guide

1.1. Installation

To install the unit-scaling library, run:

-
pip install git+https://github.com/graphcore-research/unit-scaling.git
+
pip install unit-scaling
 

For those who wish to develop on the unit-scaling codebase, clone or fork our