diff --git a/package.json b/package.json
index d1d4e92..411a295 100644
--- a/package.json
+++ b/package.json
@@ -10,7 +10,7 @@
     "start": "http-server -c-1",
     "build-clean": "npm run clean && npm run build && npm run build-bundle",
     "dev": "npm run build-clean && npm run start",
-    "prettier": "prettier --write .",
+    "prettier": "prettier --write tests/**/* src/**/*.ts",
     "build-bundle": "esbuild src/index.ts --bundle --outfile=dist/bundle.js --format=esm --target=es2020",
     "unit": "npm run build-clean && node --experimental-vm-modules node_modules/jest/bin/jest.js",
     "integration": "npm run build-clean && npx playwright test",
diff --git a/src/autograd/function.ts b/src/autograd/function.ts
index b460d0c..5771c33 100644
--- a/src/autograd/function.ts
+++ b/src/autograd/function.ts
@@ -155,21 +155,22 @@ export abstract class BinaryOp extends AutogradFunction {
     pass.setPipeline(this.pipeline);
     pass.setBindGroup(0, bindGroup);
 
+    // TODO: set these as overrides in the layers/ops level since the kernels are different
     const WORKGROUP_SIZE = 16;
     const TILE_SIZE = 8;
-    const workgropuA = Math.ceil(a.shape[0] / (TILE_SIZE * WORKGROUP_SIZE));
-    const workgropuB = Math.ceil(b.shape[1] / (TILE_SIZE * WORKGROUP_SIZE));
+    const workgroupA = Math.ceil(a.shape[0] / (TILE_SIZE * WORKGROUP_SIZE));
+    const workgroupB = Math.ceil(b.shape[1] / (TILE_SIZE * WORKGROUP_SIZE));
     console.log(
       "a.shape[0]:",
       a.shape[0],
       "b.shape[1]:",
       b.shape[1],
       "launching workgroups",
-      workgropuA,
+      workgroupA,
       ",",
-      workgropuB,
+      workgroupB,
     );
-    pass.dispatchWorkgroups(workgropuA, workgropuB);
+    pass.dispatchWorkgroups(workgroupA, workgroupB);
     pass.end();
 
     const stagingBuffer = this.device.createBuffer({
diff --git a/src/index.ts b/src/index.ts
index ce384a6..4eb468d 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -7,4 +7,12 @@ export * from "./ops/exp2.js";
 export * from "./ops/log2.js";
 export * from "./ops/ln.js";
 export * from "./ops/relu.js";
+export * from "./ops/div.js";
 export * from "./autograd/function.js";
+export * from "./layers/module.js";
+export * from "./layers/embedding.js";
+export * from "./layers/linear.js";
+export * from "./layers/norm.js";
+export * from "./layers/mlp.js";
+export * from "./layers/attention.js";
+export * from "./model/nomic_embed.js"
\ No newline at end of file
diff --git a/src/layers/attention.ts b/src/layers/attention.ts
new file mode 100644
index 0000000..236f727
--- /dev/null
+++ b/src/layers/attention.ts
@@ -0,0 +1,111 @@
+import { Tensor } from "../tensor/tensor.js";
+import { Module } from "./module.js";
+import { Linear } from "./linear.js";
+
+export class MultiHeadAttention extends Module {
+  qkv: Linear; // Combined projection for Query, Key, Value
+  output: Linear; // Output projection
+  num_heads: number;
+  head_dim: number;
+  hidden_dim: number;
+
+  constructor(hidden_dim: number, num_heads: number) {
+    super("multihead_attention");
+
+    this.num_heads = num_heads;
+    this.head_dim = Math.floor(hidden_dim / num_heads);
+    this.hidden_dim = hidden_dim;
+
+    if (this.head_dim * num_heads !== hidden_dim) {
+      throw new Error(
+        `Hidden dimension ${hidden_dim} must be divisible by number of heads ${num_heads}`,
+      );
+    }
+
+    // Combined QKV projection
+    // Projects to 3x hidden_dim for Q, K, V
+    this.qkv = new Linear(hidden_dim, hidden_dim * 3);
+
+    // Output projection
+    this.output = new Linear(hidden_dim, hidden_dim);
+  }
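+
+  // Layout note: the single qkv Linear packs Q, K, and V along its output
+  // columns, so for hidden_dim = 4 the projected matrix has 12 columns:
+  // [0, 4) is Q, [4, 8) is K, and [8, 12) is V. forward() below recovers
+  // them with slice() instead of running three separate matmuls.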
+
+  private async reshapeToHeads(tensor: Tensor): Promise<Tensor[]> {
+    const heads: Tensor[] = [];
+
+    // Each head will be (seqlen, head_dim)
+    for (let i = 0; i < this.num_heads; i++) {
+      const start = i * this.head_dim;
+      const end = start + this.head_dim;
+      const head = await tensor.slice(":", [start, end]);
+      heads.push(head);
+    }
+
+    return heads;
+  }
+
+  private async scaledDotProductAttention(
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+  ): Promise<[Tensor, number]> {
+    // Scale factor is 1/sqrt(head_dim)
+    const scale = 1 / Math.sqrt(this.head_dim);
+    const scaleTensor = Tensor.full(
+      [query.shape[0], key.shape[0]],
+      scale,
+      false,
+    );
+    // Compute attention scores
+    const [scores] = await query.matmul(key.transpose());
+    const [scaledScores] = await scores.mul(scaleTensor);
+
+    // Softmax implementation
+    const [expScores] = await scaledScores.exp();
+    const sumExp = await expScores.sum([1]);
+
+    const [attention] = await expScores.div(sumExp);
+
+    // Apply attention to values
+    return attention.matmul(value);
+  }
+
+  async forward(input: Tensor): Promise<[Tensor]> {
+    // Project input to Q, K, V
+    const [qkv] = await this.qkv.forward(input);
+
+    // Split into Q, K, V
+    const query = await qkv.slice(":", [0, this.hidden_dim]);
+    const key = await qkv.slice(":", [this.hidden_dim, this.hidden_dim * 2]);
+    const value = await qkv.slice(":", [
+      this.hidden_dim * 2,
+      this.hidden_dim * 3,
+    ]);
+
+    // Split each of Q, K, V into heads
+    const queryHeads = await this.reshapeToHeads(query);
+    const keyHeads = await this.reshapeToHeads(key);
+    const valueHeads = await this.reshapeToHeads(value);
+
+    // Compute attention for each head
+    // this will be slow, we should create bmm
+    const headOutputs: Tensor[] = [];
+    for (let i = 0; i < this.num_heads; i++) {
+      const [headOutput] = await this.scaledDotProductAttention(
+        queryHeads[i],
+        keyHeads[i],
+        valueHeads[i],
+      );
+      headOutputs.push(headOutput);
+    }
+
+    // Concatenate heads
+    let concatOutput = headOutputs[0];
+    for (let i = 1; i < headOutputs.length; i++) {
+      concatOutput = await concatOutput.concat(headOutputs[i], 1);
+    }
+
+    // Final output projection
+    return this.output.forward(concatOutput);
+  }
+}
diff --git a/src/layers/embedding.ts b/src/layers/embedding.ts
new file mode 100644
index 0000000..7608a5b
--- /dev/null
+++ b/src/layers/embedding.ts
@@ -0,0 +1,121 @@
+import { Tensor } from "../tensor/tensor.js";
+import { Module } from "./module.js";
+
+export class Embedding extends Module {
+  vocab_size: number;
+  emb_dim: number;
+  embedding: Tensor;
+  constructor(vocab_size: number, emb_dim: number) {
+    super("embedding");
+
+    this.vocab_size = vocab_size;
+    this.emb_dim = emb_dim;
+    this.embedding = Tensor.normal([vocab_size, emb_dim], true, 0.02);
+  }
+
+  async forward(...inputs: [Tensor]): Promise<[Tensor]> {
+    const [embeddings] = await this.embedding.gather(inputs[0]);
+    return [embeddings];
+  }
+}
+
+export class RotaryEmbedding extends Module {
+  base: number;
+  dimension: number;
+  theta: Tensor;
+  sequenceLength: number;
+  idxTheta: Tensor | null = null;
+  constructor(base: number, dimension: number) {
+    super("rope_embedding");
+    this.base = base;
+    this.dimension = dimension;
+
+    const theta = this.createTheta(dimension, base);
+    this.theta = new Tensor(theta, [1, dimension / 2], true);
+
+    this.sequenceLength = 0;
+    this.idxTheta = null;
+  }
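+
+  // Frequency note: for even index i, createTheta() stores the standard RoPE
+  // rotation frequency theta_i = 1 / base^(i / dimension) at slot i / 2.
+  // E.g. with dimension = 8 and base = 10000 the four frequencies are
+  // 1, 10000^(-1/4), 10000^(-1/2), and 10000^(-3/4).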
+
+  createTheta(dimension: number, base: number = 10000): Float32Array {
+    // Create a new Float32Array of the specified size
+    const result = new Float32Array(dimension / 2);
+
+    // Calculate values for each position
+    for (let i = 0; i < dimension; i += 2) {
+      const value = 1.0 / Math.pow(base, i / dimension);
+      result[i / 2] = value;
+    }
+
+    return result;
+  }
+
+  async buildCache(sequenceLength: number) {
+    const posIdx = new Float32Array(sequenceLength);
+    for (let i = 0; i < sequenceLength; i++) {
+      posIdx[i] = i;
+    }
+
+    const posTensor = new Tensor(posIdx, [sequenceLength, 1], true);
+    let [idxTheta] = await posTensor.matmul(this.theta);
+
+    idxTheta = await idxTheta.concat(idxTheta, 1);
+
+    return [idxTheta];
+  }
+
+  async forward(...inputs: [Tensor]): Promise<[Tensor]> {
+    const [x] = inputs;
+
+    const currSeqLen = x.shape[0];
+    const d2 = Math.floor(this.dimension / 2);
+
+    if (currSeqLen > this.sequenceLength || this.idxTheta === null) {
+      const [cache] = await this.buildCache(currSeqLen);
+      this.sequenceLength = currSeqLen;
+      this.idxTheta = cache;
+    }
+
+    const idxTheta = this.idxTheta;
+
+    const idxThetaLength = idxTheta.data.length;
+    const cosIdxThetaArr = new Float32Array(idxThetaLength);
+    const sinIdxThetaArr = new Float32Array(idxThetaLength);
+
+    for (let i = 0; i < idxThetaLength; i++) {
+      cosIdxThetaArr[i] = Math.cos(idxTheta.data[i]);
+      sinIdxThetaArr[i] = Math.sin(idxTheta.data[i]);
+    }
+
+    const cosIdxTheta = new Tensor(
+      cosIdxThetaArr,
+      [currSeqLen, this.dimension],
+      x.requires_grad,
+    );
+    const sinIdxTheta = new Tensor(
+      sinIdxThetaArr,
+      [currSeqLen, this.dimension],
+      x.requires_grad,
+    );
+
+    // Rewrite using tensor operations and select
+    const leftHalf = await x.slice(":", [null, d2]);
+    const rightHalf = await x.slice(":", [d2, this.dimension]);
+    const [negHalf] = await rightHalf.mul(Tensor.full([1], -1));
+
+    const half = await negHalf.concat(leftHalf, 1);
+    const xRope = await x.slice(":", [null, this.dimension]);
+
+    const [xRopePos] = await xRope.mul(cosIdxTheta);
+    const [xRopeNeg] = await half.mul(sinIdxTheta);
+
+    let [rope] = await xRopePos.add(xRopeNeg);
+    if (this.dimension < x.shape[1]) {
+      const xPass = await x.slice(":", [null, null, d2]);
+
+      rope = await rope.concat(xPass, 1);
+    }
+
+    return [rope];
+  }
+}
diff --git a/src/layers/linear.ts b/src/layers/linear.ts
new file mode 100644
index 0000000..5aea6af
--- /dev/null
+++ b/src/layers/linear.ts
@@ -0,0 +1,20 @@
+import { Tensor } from "../tensor/tensor.js";
+import { Module } from "./module.js";
+
+export class Linear extends Module {
+  weight: Tensor;
+  bias: Tensor;
+
+  constructor(inputSize: number, outputSize: number) {
+    super("linear");
+    this.weight = Tensor.normal([inputSize, outputSize], true, 0.02);
+    this.bias = Tensor.full([outputSize], 0, true);
+  }
+
+  async forward(...inputs: [Tensor]): Promise<[Tensor]> {
+    const [input] = inputs;
+    const [output] = await input.matmul(this.weight);
+    const [outputBias] = await output.add(this.bias);
+    return [outputBias];
+  }
+}
diff --git a/src/layers/mlp.ts b/src/layers/mlp.ts
new file mode 100644
index 0000000..a338091
--- /dev/null
+++ b/src/layers/mlp.ts
@@ -0,0 +1,129 @@
+import { Tensor } from "../tensor/tensor.js";
+import { Module } from "./module.js";
+import { Linear } from "./linear.js";
+
+type ActivationType = "relu" | "silu" | "gelu" | "swiglu" | "none";
+
+export class MLP extends Module {
+  up: Linear; // Project up to larger dimension
+  down: Linear; // Project back down
+  activation: ActivationType;
+
+  constructor(
+    dim: number, // input/output dimension
+    hiddenDim: number, // hidden dimension
+    activation: ActivationType = "relu",
+  ) {
+    super("mlp");
+
+    // For SwiGLU, we need double the hidden dimension for gating
+    const actualHiddenDim = activation === "swiglu" ? hiddenDim * 2 : hiddenDim;
+
+    this.up = new Linear(dim, actualHiddenDim);
+    this.down = new Linear(hiddenDim, dim);
+    this.activation = activation;
+  }
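+
+  // Dimension note: with activation === "swiglu" the up projection emits
+  // 2 * hiddenDim columns -- the first hiddenDim act as the gate and the
+  // second hiddenDim as the value (see applyActivation below) -- while
+  // down still maps hiddenDim back to dim. For every other activation the
+  // hidden width is simply hiddenDim.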
+
+  private async gelu(x: Tensor): Promise<[Tensor, number]> {
+    // GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3)))
+    const sqrt2OverPi = Math.sqrt(2 / Math.PI);
+
+    // Calculate x^3
+    const [xSquared] = await x.mul(x);
+    const [xCubed] = await xSquared.mul(x);
+
+    // Calculate 0.044715 * x^3
+    const [scaledCube] = await xCubed.mul(
+      Tensor.full(x.shape, 0.044715, false),
+    );
+
+    // Add x to the scaled cube
+    const [innerSum] = await x.add(scaledCube);
+
+    // Multiply by sqrt(2/π)
+    const [scaled] = await innerSum.mul(
+      Tensor.full(x.shape, sqrt2OverPi, false),
+    );
+
+    // Calculate tanh using (e^x - e^-x)/(e^x + e^-x)
+    const [exp] = await scaled.exp();
+    const [negScaled] = await scaled.mul(Tensor.full(x.shape, -1, false));
+    const [negExp] = await negScaled.exp();
+
+    const [numerator] = await exp.sub(negExp);
+    const [denominator] = await exp.add(negExp);
+
+    const [tanh] = await numerator.div(denominator);
+
+    // Add 1 to tanh result
+    const [tanhPlusOne] = await tanh.add(Tensor.full(x.shape, 1, false));
+
+    // Multiply by x
+    const [xTimesSum] = await x.mul(tanhPlusOne);
+
+    // Multiply by 0.5 for final result
+    return xTimesSum.mul(Tensor.full(x.shape, 0.5, false));
+  }
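+
+  // tanh above is built from the engine's exp kernel via the identity
+  // tanh(x) = (e^x - e^-x) / (e^x + e^-x), since there is no dedicated
+  // tanh op; note that e^x can overflow f32 for large inputs.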
+
+  private async silu(x: Tensor): Promise<[Tensor, number]> {
+    const [negX] = await x.mul(Tensor.full(x.shape, -1, false));
+    const [expNegX] = await negX.exp();
+    const [onePlusExpNegX] = await expNegX.add(Tensor.full(x.shape, 1, false));
+
+    const [sigmoid] = await Tensor.full(x.shape, 1, false).div(onePlusExpNegX);
+    return x.mul(sigmoid);
+  }
+
+  private async applyActivation(x: Tensor): Promise<[Tensor, number]> {
+    switch (this.activation) {
+      case "relu":
+        return x.relu();
+      case "silu":
+        return this.silu(x);
+      case "gelu":
+        return this.gelu(x);
+      case "swiglu": {
+        // Split the tensor in half for gate and value paths
+        const halfSize = Math.floor(x.shape[x.shape.length - 1] / 2);
+        const [gate, value] = await Promise.all([
+          x.slice(":", [0, halfSize]),
+          x.slice(":", [halfSize, x.shape[x.shape.length - 1]]),
+        ]);
+        const [gateActivated] = await this.silu(gate);
+        return gateActivated.mul(value);
+      }
+      case "none":
+        return [x, -1];
+      default:
+        throw new Error(`Unknown activation type: ${this.activation}`);
+    }
+  }
+
+  async forward(...inputs: [Tensor]): Promise<[Tensor]> {
+    const [input] = inputs;
+
+    // Project up to hidden dimension
+    const [upProjected] = await this.up.forward(input);
+
+    // Apply activation
+    const [activated] = await this.applyActivation(upProjected);
+
+    // Project back down
+    return this.down.forward(activated);
+  }
+
+  // Helper method for creating standard configurations
+  static create(config: {
+    dim: number; // input/output dimension
+    hiddenMul?: number; // multiplier for hidden dimension (default 4)
+    activation?: ActivationType;
+  }): MLP {
+    const {
+      dim,
+      hiddenMul = 4, // typical transformer uses 4x dimension for FFN
+      activation = "relu",
+    } = config;
+
+    return new MLP(dim, dim * hiddenMul, activation);
+  }
+}
diff --git a/src/layers/module.ts b/src/layers/module.ts
new file mode 100644
index 0000000..5a4a88b
--- /dev/null
+++ b/src/layers/module.ts
@@ -0,0 +1,19 @@
+import { Tensor } from "../tensor/tensor";
+export abstract class Module {
+  protected name: string;
+  constructor(name: string) {
+    if (name === null || name === undefined) {
+      throw Error("Name cannot be null or undefined");
+    }
+
+    this.name = name;
+  }
+
+  /**
+   * Abstract method that must be implemented by all layer subclasses
+   * Defines the forward pass computation of the layer
+   * @param inputs - Input tensor(s) to the layer
+   * @returns Output tensor(s) from the layer
+   */
+  abstract forward(...args: Tensor[]): Promise<[Tensor]>;
+}
diff --git a/src/layers/norm.ts b/src/layers/norm.ts
new file mode 100644
index 0000000..8adda7d
--- /dev/null
+++ b/src/layers/norm.ts
@@ -0,0 +1,40 @@
+import { Tensor } from "../tensor/tensor.js";
+import { Module } from "./module.js";
+
+export class LayerNorm extends Module {
+  normalized_shape: number[];
+  eps: Tensor;
+  gamma: Tensor;
+  beta: Tensor;
+  constructor(normalized_shape: number[], eps: number) {
+    super("layer_norm");
+    this.normalized_shape = normalized_shape;
+    this.eps = Tensor.full([1], eps); // Make eps a scalar tensor
+    // gamma and beta should match the feature dimension
+    this.gamma = Tensor.full([1, normalized_shape[0]], 1); // [1, 3] for broadcasting
+    this.beta = Tensor.full([1, normalized_shape[0]], 0); // [1, 3] for broadcasting
+  }
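+
+  // forward() below computes the usual layer norm,
+  //   y = (x - mean) / sqrt(variance + eps) * gamma + beta,
+  // reducing mean/variance over the feature dimension (dim 1) and reshaping
+  // them to [batch, 1] so the elementwise ops broadcast across features.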
+
+  async forward(x: Tensor): Promise<[Tensor]> {
+    const reduction_dims = [1]; // Reduce over the feature dimension
+
+    // Calculate mean and reshape for broadcasting
+    const mean = await x.mean(reduction_dims);
+    console.log("mean.data", mean.data.toString());
+    mean.shape = [mean.shape[0], 1]; // [2, 1]
+
+    const variance = await x.variance(reduction_dims);
+    variance.shape = [variance.shape[0], 1]; // [2, 1]
+
+    const [numerator] = await x.sub(mean); // [2, 3]
+    console.log("numerator.data", numerator.data.toString());
+    const [denominator] = await variance.add(this.eps);
+    const sqrtDenom = await denominator.sqrt();
+    const [normalized] = await numerator.div(sqrtDenom);
+
+    const [gamma] = await normalized.mul(this.gamma); // [2, 3] * [1, 3] -> [2, 3]
+    const [beta] = await gamma.add(this.beta); // [2, 3] + [1, 3] -> [2, 3]
+    return [beta];
+  }
+}
diff --git a/src/model/nomic_embed.ts b/src/model/nomic_embed.ts
new file mode 100644
index 0000000..7a64acc
--- /dev/null
+++ b/src/model/nomic_embed.ts
@@ -0,0 +1,195 @@
+import { Tensor } from "../tensor/tensor.js";
+import { Module } from "../layers/module.js";
+import { LayerNorm } from "../layers/norm.js";
+import { MultiHeadAttention } from "../layers/attention.js";
+import { MLP } from "../layers/mlp.js";
+import { Embedding } from "../layers/embedding.js";
+
+export interface NomicEmbedConfig {
+  vocab_size: number;
+  hidden_size: number;
+  num_hidden_layers: number;
+  num_attention_heads: number;
+  intermediate_size: number;
+  hidden_act: string;
+  hidden_dropout_prob: number;
+  attention_probs_dropout_prob: number;
+  max_position_embeddings: number;
+  type_vocab_size: number;
+  initializer_range: number;
+  layer_norm_eps: number;
+  pad_token_id: number;
+  position_embedding_type: string;
+  use_cache: boolean;
+  classifier_dropout: number | null;
+  rotary_emb_fraction: number;
+  use_flash_attn: boolean;
+  qkv_proj_bias: boolean;
+  mlp_fc1_bias: boolean;
+  mlp_fc2_bias: boolean;
+  causal: boolean;
+}
+
+class NomicBertEmbeddings extends Module {
+  private wordEmbeddings: Embedding;
+  private positionEmbeddings: Embedding | null;
+  private typeEmbeddings: Embedding | null;
+  private maxPositionEmbeddings: number;
+  private typeVocabSize: number;
+
+  constructor(config: NomicEmbedConfig) {
+    super("bert_embeddings");
+
+    // Word embeddings
+    this.wordEmbeddings = new Embedding(config.vocab_size, config.hidden_size);
+
+    // Position embeddings if using absolute positions
+    this.maxPositionEmbeddings = config.max_position_embeddings;
+    this.positionEmbeddings =
+      this.maxPositionEmbeddings > 0 && config.rotary_emb_fraction <= 0
+        ? new Embedding(config.max_position_embeddings, config.hidden_size)
+        : null;
+
+    // Token type embeddings if used
+    this.typeVocabSize = config.type_vocab_size;
+    this.typeEmbeddings =
+      this.typeVocabSize > 0
+        ? new Embedding(config.type_vocab_size, config.hidden_size)
+        : null;
+  }
+
+  async forward(
+    inputIds: Tensor,
+    positionIds?: Tensor,
+    tokenTypeIds?: Tensor,
+    inputsEmbeds?: Tensor,
+  ): Promise<[Tensor]> {
+    // Get word embeddings
+    let [embeddings] = inputsEmbeds
+      ? [inputsEmbeds]
+      : await this.wordEmbeddings.forward(inputIds);
+
+    // Add token type embeddings if used
+    // if (this.typeEmbeddings && this.typeVocabSize > 0 && tokenTypeIds) {
+    //   const [typeEmbeddings] = await this.typeEmbeddings.forward(tokenTypeIds);
+    //   console.log("typeEmbeddings.data", typeEmbeddings.data.toString());
+    //   console.log("typeEmbeddings.shape", typeEmbeddings.shape);
+    //   [embeddings] = await embeddings.add(typeEmbeddings);
+    // }
+
+    return [embeddings];
+  }
+}
+
+class NomicBertLayer extends Module {
+  private attention: MultiHeadAttention;
+  private mlp: MLP;
+  private layerNorm1: LayerNorm;
+  private layerNorm2: LayerNorm;
+
+  constructor(config: NomicEmbedConfig) {
+    super("bert_layer");
+    this.attention = new MultiHeadAttention(
+      config.hidden_size,
+      config.num_attention_heads,
+    );
+    this.mlp = new MLP(config.hidden_size, config.intermediate_size);
+    this.layerNorm1 = new LayerNorm(
+      [config.hidden_size],
+      config.layer_norm_eps,
+    );
+    this.layerNorm2 = new LayerNorm(
+      [config.hidden_size],
+      config.layer_norm_eps,
+    );
+  }
+
+  async forward(...inputs: [Tensor]): Promise<[Tensor]> {
+    // Self-attention
+    const [hiddenStates] = inputs;
+    const [normed1] = await this.layerNorm1.forward(hiddenStates);
+    const [attnOutput] = await this.attention.forward(normed1);
+    const [residual1] = await hiddenStates.add(attnOutput);
+
+    // MLP
+    const [normed2] = await this.layerNorm2.forward(residual1);
+    const [mlpOutput] = await this.mlp.forward(normed2);
+    const [residual2] = await residual1.add(mlpOutput);
+    return [residual2];
+  }
+}
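+
+// NomicBertLayer above is a pre-norm transformer block: the residual stream
+// is normalized before attention and before the MLP, i.e.
+//   x = x + Attn(LN1(x)); x = x + MLP(LN2(x)).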
+
+class NomicBertEncoder extends Module {
+  private layers: NomicBertLayer[];
+
+  constructor(config: NomicEmbedConfig) {
+    super("bert_encoder");
+    this.layers = Array(config.num_hidden_layers)
+      .fill(null)
+      .map(() => new NomicBertLayer(config));
+  }
+
+  async forward(...args: Tensor[]): Promise<[Tensor]> {
+    let [hiddenStates, attentionMask] = args;
+    let currentOutput = hiddenStates;
+
+    // Pass through each layer
+    for (const layer of this.layers) {
+      [currentOutput] = await layer.forward(currentOutput);
+    }
+
+    return [currentOutput];
+  }
+}
+
+export class NomicEmbed extends Module {
+  private embeddings: NomicBertEmbeddings;
+  private encoder: NomicBertEncoder;
+  private emb_ln: LayerNorm;
+
+  constructor(config: NomicEmbedConfig) {
+    super("nomic_embed");
+
+    // Initialize components
+    this.embeddings = new NomicBertEmbeddings(config);
+    this.encoder = new NomicBertEncoder(config);
+    this.emb_ln = new LayerNorm([config.hidden_size], config.layer_norm_eps);
+  }
+
+  private async meanPooling(
+    modelOutput: Tensor,
+    attentionMask: Tensor,
+  ): Promise<[Tensor]> {
+    return [await modelOutput.mean([0])];
+  }
+
+  async forward(...args: Tensor[]): Promise<[Tensor]> {
+    // Get embeddings
+    const [inputIds, attentionMask, positionIds, tokenTypeIds] = args;
+    const [hidden] = await this.embeddings.forward(
+      inputIds,
+      positionIds,
+      tokenTypeIds,
+    );
+    console.log("hidden.data", hidden.data.toString());
+
+    // Apply layer norm
+    const [normed] = await this.emb_ln.forward(hidden);
+    console.log("normed.data", normed.data.toString());
+
+    // Pass through encoder
+    const [encoded] = await this.encoder.forward(normed, attentionMask);
+    // Mean pooling
+    console.log("encoded.data", encoded.data.toString());
+    const [pooled] = await this.meanPooling(encoded, attentionMask);
+    console.log("pooled.shape", pooled.shape);
+
+    const [norm] = await pooled.norm(2, 0);
+    console.log("norm.shape", norm.shape);
+    console.log("norm", norm.data.toString());
+
+    // Normalize embeddings
+    const [pooledNormed] = await pooled.div(norm);
+    return [pooledNormed];
+  }
+}
diff --git a/src/ops/add.ts b/src/ops/add.ts
index a8de232..a0476fa 100644
--- a/src/ops/add.ts
+++ b/src/ops/add.ts
@@ -10,12 +10,46 @@
       if (b.shape.length === 1 && b.shape[0] === 1) {
         // Broadcast scalar
         b = Tensor.full(a.shape, b.data[0], b.requires_grad);
+      } else if (b.shape.length === 1 && b.shape[0] === a.shape[1]) {
+        // Broadcast [m] to [n, m]
+        b = Tensor.broadcast(b, a.shape[0], b.requires_grad);
+      } else if (b.shape.length === 2 && b.shape[1] === 1) {
+        // Broadcast [n, 1] to [n, m]
+        const newShape = [b.shape[0], a.shape[1]];
+        console.log("Broadcasting [n,1] to shape:", newShape);
+        const newData = new Float32Array(newShape[0] * newShape[1]);
+        for (let i = 0; i < b.shape[0]; i++) {
+          for (let j = 0; j < a.shape[1]; j++) {
+            newData[i * a.shape[1] + j] = b.data[i];
+          }
+        }
+        b = new Tensor(newData, newShape, b.requires_grad);
+      } else if (
+        b.shape.length === 2 &&
+        b.shape[0] === 1 &&
+        b.shape[1] === a.shape[1]
+      ) {
+        // Broadcast [1, m] to [n, m]
+        const newShape = [a.shape[0], b.shape[1]];
+        console.log("Broadcasting [1,m] to shape:", newShape);
+        const newData = new Float32Array(newShape[0] * newShape[1]);
+        for (let i = 0; i < a.shape[0]; i++) {
+          for (let j = 0; j < b.shape[1]; j++) {
+            newData[i * b.shape[1] + j] = b.data[j];
+          }
+        }
+        b = new Tensor(newData, newShape, b.requires_grad);
       } else {
         throw new Error(
           `Incompatible shapes for Add: ${a.shape} and ${b.shape}`,
         );
       }
     }
+    console.log("add a.shape:", a.shape);
+    console.log("a.data:", a.data.toString());
+    console.log("add broadcasted b.shape:", b.shape);
+    console.log("b.data:", b.data.toString());
+
     return b;
   }
diff --git a/src/ops/div.ts b/src/ops/div.ts
new file mode 100644
index 0000000..828d84d
--- /dev/null
+++ b/src/ops/div.ts
@@ -0,0 +1,114 @@
+import { BinaryOp } from "../autograd/function.js";
+import { Tensor } from "../tensor/tensor.js";
+import { divShader } from "../shaders/div.js";
+
+export class Div extends BinaryOp {
+  protected readonly shader: string = divShader;
+
+  validateShapes(a: Tensor, b: Tensor): Tensor {
+    // Handle scalar case first
+    if (b.shape.length === 1 && b.shape[0] === 1) {
+      return Tensor.full(a.shape, b.data[0], b.requires_grad);
+    }
+
+    // Get dimensions of both tensors
+    const dimA = a.shape.length;
+    const dimB = b.shape.length;
+
+    // Calculate the number of dimensions in the output
+    const maxDim = Math.max(dimA, dimB);
+
+    // Pad shapes with 1s from the left to match max dimensions
+    const paddedA = Array(maxDim - dimA)
+      .fill(1)
+      .concat(a.shape);
+    const paddedB = Array(maxDim - dimB)
+      .fill(1)
+      .concat(b.shape);
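+
+    // Broadcasting follows the NumPy rule: after left-padding with 1s, the
+    // shapes are compared dimension by dimension, and a size-1 dimension
+    // stretches to match the other tensor. E.g. a: [2, 3] with b: [3] pads
+    // b to [1, 3] and broadcasts it to [2, 3].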
+
+    // Check if shapes can be broadcast
+    const outputShape = [];
+    for (let i = 0; i < maxDim; i++) {
+      if (paddedA[i] === paddedB[i]) {
+        outputShape.push(paddedA[i]);
+      } else if (paddedA[i] === 1) {
+        outputShape.push(paddedB[i]);
+      } else if (paddedB[i] === 1) {
+        outputShape.push(paddedA[i]);
+      } else {
+        throw new Error(
+          `Incompatible shapes for broadcasting: ${a.shape} and ${b.shape}`,
+        );
+      }
+    }
+
+    // If shapes are already compatible, return original tensor
+    if (outputShape.every((dim, i) => dim === b.shape[i])) {
+      return b;
+    }
+
+    // Create new broadcasted tensor
+    const newSize = outputShape.reduce((acc, dim) => acc * dim, 1);
+    const newData = new Float32Array(newSize);
+
+    // For a tensor of shape [n] being broadcast to [n, m],
+    // we want to repeat each element m times consecutively
+    if (b.shape.length === 1 && outputShape.length === 2) {
+      const n = b.shape[0];
+      const m = outputShape[1];
+
+      for (let i = 0; i < n; i++) {
+        for (let j = 0; j < m; j++) {
+          newData[i * m + j] = b.data[i];
+        }
+      }
+    } else {
+      // General case for broadcasting across multiple dimensions
+      for (let i = 0; i < newSize; i++) {
+        // Convert flat index to coordinates
+        let remaining = i;
+        const coords = [];
+        for (const dim of outputShape) {
+          coords.push(remaining % dim);
+          remaining = Math.floor(remaining / dim);
+        }
+        coords.reverse();
+
+        // Map to input tensor coordinates
+        let inputIdx = 0;
+        let stride = 1;
+        for (let dim = b.shape.length - 1; dim >= 0; dim--) {
+          const outputDim = dim + (outputShape.length - b.shape.length);
+          const coord = coords[outputDim] % b.shape[dim];
+          inputIdx += coord * stride;
+          stride *= b.shape[dim];
+        }
+
+        newData[i] = b.data[inputIdx];
+      }
+    }
+
+    return new Tensor(newData, outputShape, b.requires_grad);
+  }
+
+  async backward(grad_output: Tensor): Promise<Tensor[]> {
+    const [a, b] = this.inputs;
+    const [aRequiresGrad, bRequiresGrad] = this.requiresGrad;
+
+    // d(a/b)/da = 1/b, so grad_a = grad_output / b
+    const grad_a_result = await this.forward(grad_output, b);
+    const grad_a = aRequiresGrad ? grad_a_result[0] : null;
+    if (grad_a !== null) {
+      await a.setGrad(grad_a);
+    }
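+
+    // NOTE: the quotient rule gives d(a/b)/db = -a / b^2, so the exact
+    // gradient would be -(a * grad_output) / (b * b); the line below reuses
+    // the forward kernel as a stand-in and does not implement that yet.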
+
+    const grad_b_result = await this.forward(a, grad_output);
+    const grad_b = bRequiresGrad ? grad_b_result[0] : null;
+    if (grad_b !== null) {
+      await b.setGrad(grad_b);
+    }
+
+    return [grad_a, grad_b].filter(
+      (tensor): tensor is Tensor => tensor !== null,
+    );
+  }
+}
diff --git a/src/ops/mul.ts b/src/ops/mul.ts
index 8f8a3f9..45291f6 100644
--- a/src/ops/mul.ts
+++ b/src/ops/mul.ts
@@ -10,12 +10,18 @@
       if (b.shape.length === 1 && b.shape[0] === 1) {
         // Broadcast scalar
         b = Tensor.full(a.shape, b.data[0], b.requires_grad);
+      } else if (b.shape[0] === 1 && b.shape[1] === a.shape[1]) {
+        // broadcast [1, n] to [m, n]
+        b = Tensor.full(a.shape, b.data[0], b.requires_grad);
       } else {
         throw new Error(
           `Incompatible shapes for Mul: ${a.shape} and ${b.shape}`,
         );
       }
     }
+    if (a.shape.length === 1) {
+      a.shape = [a.shape[0], 1];
+    }
     return b;
   }
diff --git a/src/shaders/add.ts b/src/shaders/add.ts
index 599d62f..8e8f565 100644
--- a/src/shaders/add.ts
+++ b/src/shaders/add.ts
@@ -19,4 +19,4 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     result[row * dimensions.N + col] = a[row * dimensions.N + col] + scalar[row * dimensions.N + col];
   }
 }
-`;
\ No newline at end of file
+`;
diff --git a/src/shaders/div.ts b/src/shaders/div.ts
new file mode 100644
index 0000000..c1b72da
--- /dev/null
+++ b/src/shaders/div.ts
@@ -0,0 +1,22 @@
+export const divShader = `
+struct Dimensions {
+  M: u32,
+  N: u32,
+}
+
+@group(0) @binding(0) var<uniform> dimensions: Dimensions;
+@group(0) @binding(1) var<storage, read> a: array<f32>;
+@group(0) @binding(2) var<storage, read> scalar: array<f32>;
+@group(0) @binding(3) var<storage, read_write> result: array<f32>;
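+
+// One thread per output element: each invocation decodes its flat index into
+// (row, col) for the M x N matrix and divides elementwise; the bounds check
+// covers the final partially-filled workgroup of 64 threads.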
+
+@compute @workgroup_size(64)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
+  let global_idx = global_id.x;
+  let row = global_idx / dimensions.N;
+  let col = global_idx % dimensions.N;
+
+  if (global_idx < dimensions.M * dimensions.N) {
+    result[row * dimensions.N + col] = a[row * dimensions.N + col] / scalar[row * dimensions.N + col];
+  }
+}
+`;
diff --git a/src/shaders/exp.ts b/src/shaders/exp.ts
index 420e566..b9b62fc 100644
--- a/src/shaders/exp.ts
+++ b/src/shaders/exp.ts
@@ -18,4 +18,4 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     result[row * dimensions.N + col] = exp(a[row * dimensions.N + col]);
   }
 }
-`
\ No newline at end of file
+`;
diff --git a/src/shaders/exp2.ts b/src/shaders/exp2.ts
index f1b74c9..cf5c2d1 100644
--- a/src/shaders/exp2.ts
+++ b/src/shaders/exp2.ts
@@ -18,4 +18,4 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     result[row * dimensions.N + col] = exp2(a[row * dimensions.N + col]);
   }
 }
-`;
\ No newline at end of file
+`;
diff --git a/src/shaders/ln.ts b/src/shaders/ln.ts
index 21822fe..efb75e3 100644
--- a/src/shaders/ln.ts
+++ b/src/shaders/ln.ts
@@ -18,4 +18,4 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     result[row * dimensions.N + col] = log(a[row * dimensions.N + col]);
   }
 }
-`;
\ No newline at end of file
+`;
diff --git a/src/shaders/log2.ts b/src/shaders/log2.ts
index 2114af9..e85097e 100644
--- a/src/shaders/log2.ts
+++ b/src/shaders/log2.ts
@@ -18,4 +18,4 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     result[row * dimensions.N + col] = log2(a[row * dimensions.N + col]);
   }
 }
-`;
\ No newline at end of file
+`;
diff --git a/src/shaders/matmul.ts b/src/shaders/matmul.ts
index 926804a..97515c5 100644
--- a/src/shaders/matmul.ts
+++ b/src/shaders/matmul.ts
@@ -326,4 +326,4 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     }
   }
 }
-`;
\ No newline at end of file
+`;
diff --git a/src/shaders/mul.ts b/src/shaders/mul.ts
index 642bd51..6bb0053 100644
--- a/src/shaders/mul.ts
+++ b/src/shaders/mul.ts
@@ -19,4 +19,4 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     result[row * dimensions.N + col] = a[row * dimensions.N + col] * scalar[row * dimensions.N + col];
   }
 }
-`;
\ No newline at end of file
+`;
diff --git a/src/shaders/relu.ts b/src/shaders/relu.ts
index 93a5443..0b6e600 100644
--- a/src/shaders/relu.ts
+++ b/src/shaders/relu.ts
@@ -18,4 +18,4 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     result[row * dimensions.N + col] = max(a[row * dimensions.N + col], 0);
   }
 }
-`;
\ No newline at end of file
+`;
diff --git a/src/tensor/tensor.ts b/src/tensor/tensor.ts
index 759b9b7..ac90168 100644
--- a/src/tensor/tensor.ts
+++ b/src/tensor/tensor.ts
@@ -6,9 +6,17 @@
 import { Log2 } from "../ops/log2.js";
 import { ReLU } from "../ops/relu.js";
 import { Exp2 } from "../ops/exp2.js";
 import { Ln } from "../ops/ln.js";
+import { Div } from "../ops/div.js";
 import { AutogradFunction } from "../autograd/function.js";
 
+type SliceArg =
+  | number
+  | [number | null]
+  | [number | null, number | null]
+  | [number | null, number | null, number | null]
+  | ":";
+
 export class Tensor {
   data: Float32Array;
   shape: number[];
@@ -52,6 +60,41 @@
     return Tensor.full(tensor.shape, 0, tensor.requires_grad);
   }
 
+  static randn(shape: number[], requires_grad = false) {
+    const data = new Float32Array(shape.reduce((a, b) => a * b));
+
+    for (let i = 0; i < data.length; i++) {
+      data[i] = Math.random() * 2 - 1;
+    }
+
+    return new Tensor(data, shape, requires_grad);
+  }
+
+  static normal(
+    shape: number[],
+    requires_grad = false,
+    initializer_range = 0.01,
+  ) {
+    const data = new Float32Array(shape.reduce((a, b) => a * b));
+
+    for (let i = 0; i < data.length; i++) {
+      data[i] = Math.random() * 2 * initializer_range - initializer_range;
+    }
+
+    return new Tensor(data, shape, requires_grad);
+  }
+
+  static broadcast(tensor: Tensor, size: number, requires_grad = false) {
+    const shape = [size, ...tensor.shape];
+    const data = new Float32Array(shape.reduce((a, b) => a * b));
+
+    for (let i = 0; i < data.length; i++) {
+      data[i] = tensor.data[i % tensor.shape.reduce((a, b) => a * b)];
+    }
+
+    return new Tensor(data, shape, requires_grad);
+  }
+
   async add(tensor: Tensor) {
     const addOp = await Add.create();
 
@@ -64,6 +107,182 @@
     return mulOp.forward(this, tensor);
   }
 
+  async sub(tensor: Tensor) {
+    if (tensor.shape.length === 1 && this.shape.length === 2) {
+      // Broadcasting [n] to [m, n]
+      const newShape = [this.shape[0], tensor.shape[0]];
+      tensor = Tensor.full(newShape, tensor.data[0], tensor.requires_grad);
+    }
+
+    const negOne = Tensor.full(tensor.shape, -1, false);
+    const [negTensor] = await tensor.mul(negOne);
+    console.log("this.shape", this.shape);
+    return this.add(negTensor);
+  }
+
+  async mean(dims: number[]): Promise<Tensor> {
+    // Calculate new shape after reduction
+    const shape = this.shape.slice();
+    const size = dims.reduce((acc, dim) => acc * shape[dim], 1);
+
+    dims.sort((a, b) => b - a); // Sort in descending order to remove correctly
+    dims.forEach((dim) => shape.splice(dim, 1));
+    if (shape.length === 0) shape.push(1);
+
+    const result = new Float32Array(shape.reduce((a, b) => a * b, 1));
+
+    // For 1D case
+    if (this.shape.length === 1 && dims.includes(0)) {
+      let sum = 0;
+      for (let i = 0; i < this.data.length; i++) {
+        sum += this.data[i];
+      }
+      result[0] = sum / size;
+      return new Tensor(result, shape, this.requires_grad);
+    }
+
+    // For higher dimensions (keeping existing logic for 2D)
+    const stride = this.shape[1];
+    for (let i = 0; i < this.shape[0]; i++) {
+      let sum = 0;
+      for (let j = 0; j < stride; j++) {
+        sum += this.data[i * stride + j];
+      }
+      result[i] = sum / size;
+    }
+
+    return new Tensor(result, shape, this.requires_grad);
+  }
+
+  async sum(dims: number[]): Promise<Tensor> {
+    const shape = this.shape.slice();
+
+    // Sort dimensions in descending order for correct removal
+    dims.sort((a, b) => b - a);
+    dims.forEach((dim) => shape.splice(dim, 1));
+    if (shape.length === 0) shape.push(1);
+
+    const result = new Float32Array(shape.reduce((a, b) => a * b, 1));
+
+    // Special case: if we're summing all dimensions, just sum everything
+    if (
+      dims.length === this.shape.length ||
+      (this.shape.length === 2 && dims.includes(0) && dims.includes(1))
+    ) {
+      let sum = 0;
+      for (let i = 0; i < this.data.length; i++) {
+        sum += this.data[i];
+      }
+      result[0] = sum;
+      return new Tensor(result, [1], this.requires_grad);
+    }
+
+    // For 1D case
+    if (this.shape.length === 1 && dims.includes(0)) {
+      let sum = 0;
+      for (let i = 0; i < this.data.length; i++) {
+        sum += this.data[i];
+      }
+      result[0] = sum;
+      return new Tensor(result, shape, this.requires_grad);
+    }
+
+    // For 2D case
+    if (this.shape.length === 2) {
+      if (dims.includes(0)) {
+        // Sum along first dimension (vertically)
+        const cols = this.shape[1];
+        const rows = this.shape[0];
+        for (let j = 0; j < cols; j++) {
+          let sum = 0;
+          for (let i = 0; i < rows; i++) {
+            sum += this.data[i * cols + j];
+          }
+          result[j] = sum;
+        }
+      } else if (dims.includes(1)) {
+        // Sum along second dimension (horizontally)
+        const cols = this.shape[1];
+        const rows = this.shape[0];
+        for (let i = 0; i < rows; i++) {
+          let sum = 0;
+          for (let j = 0; j < cols; j++) {
+            sum += this.data[i * cols + j];
+          }
+          result[i] = sum;
+        }
+      }
+    }
+
+    return new Tensor(result, shape, this.requires_grad);
+  }
+
+  async pow(p: number): Promise<[Tensor]> {
+    const result = new Float32Array(this.data.length);
+    for (let i = 0; i < this.data.length; i++) {
+      result[i] = this.data[i] ** p;
+    }
+    return [new Tensor(result, this.shape.slice(), this.requires_grad)];
+  }
+
+  async norm(p: number = 2, dim: number = 0): Promise<[Tensor]> {
+    const [norm] = await this.pow(p);
+    const sumNorm = await norm.sum([dim]);
+    const [rootNorm] = await sumNorm.pow(1 / p);
+    return [rootNorm];
+  }
+
+  async variance(dims: number[]): Promise<Tensor> {
+    const mean = await this.mean(dims);
+    const shape = this.shape.slice();
+    const size = dims.reduce((acc, dim) => acc * shape[dim], 1);
+
+    dims.sort((a, b) => b - a);
+    dims.forEach((dim) => shape.splice(dim, 1));
+    if (shape.length === 0) shape.push(1);
+
+    const result = new Float32Array(shape.reduce((a, b) => a * b, 1));
+
+    // For 1D case
+    if (this.shape.length === 1 && dims.includes(0)) {
+      let sumSquaredDiff = 0;
+      const meanValue = mean.data[0];
+      for (let i = 0; i < this.data.length; i++) {
+        const diff = this.data[i] - meanValue;
+        sumSquaredDiff += diff * diff;
+      }
+      result[0] = sumSquaredDiff / size;
+      return new Tensor(result, shape, this.requires_grad);
+    }
+
+    // For higher dimensions
+    const stride = this.shape[1];
+    for (let i = 0; i < this.shape[0]; i++) {
+      let sumSquaredDiff = 0;
+      const meanValue = mean.data[i];
+      for (let j = 0; j < stride; j++) {
+        const diff = this.data[i * stride + j] - meanValue;
+        sumSquaredDiff += diff * diff;
+      }
+      result[i] = sumSquaredDiff / size;
+    }
+
+    return new Tensor(result, shape, this.requires_grad);
+  }
+
+  async sqrt(): Promise<Tensor> {
+    const result = new Float32Array(this.data.length);
+    for (let i = 0; i < this.data.length; i++) {
+      result[i] = Math.sqrt(this.data[i]);
+    }
+    return new Tensor(result, this.shape.slice(), this.requires_grad);
+  }
+
+  async div(tensor: Tensor): Promise<[Tensor, number]> {
+    const divOp = await Div.create();
+    return divOp.forward(this, tensor);
+  }
+
   async matmul(tensor: Tensor) {
     const matmulOp = await MatMul.create();
 
@@ -100,6 +319,30 @@
     return reluOp.forward(this);
   }
 
+  async gather(indices: Tensor): Promise<[Tensor, number]> {
+    // For input shape [batch_size] and embedding matrix [vocab_size, embedding_dim]
+    // We want output shape [batch_size, embedding_dim]
+    const batchSize = indices.shape[0];
+    const embeddingDim = this.shape[1];
+    const result = new Float32Array(batchSize * embeddingDim);
+
+    // For each item in the batch
+    for (let i = 0; i < batchSize; i++) {
+      const tokenId = indices.data[i];
+      // Copy the entire embedding vector for this token
+      const sourceOffset = tokenId * embeddingDim;
+      const targetOffset = i * embeddingDim;
+      for (let j = 0; j < embeddingDim; j++) {
+        result[targetOffset + j] = this.data[sourceOffset + j];
+      }
+    }
+
+    return [
+      new Tensor(result, [batchSize, embeddingDim], indices.requires_grad),
+      -1,
+    ];
+  }
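+
+  // gather() treats indices as row ids into this [vocab_size, embedding_dim]
+  // matrix: output row i is a copy of the flat range
+  // [indices[i] * embedding_dim, (indices[i] + 1) * embedding_dim). E.g.
+  // indices [1, 0] into a [3, 2] matrix yields a [2, 2] tensor holding
+  // rows 1 and 0, in that order.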
+
   transpose() {
     const [rows, cols] = this.shape;
     const transposedData = new Float32Array(this.data.length);
@@ -163,4 +406,296 @@
     return topo_order;
   }
+
+  async concat(tensor: Tensor, axis: number): Promise<Tensor> {
+    // Validate axis
+    if (axis < 0 || axis >= this.shape.length) {
+      throw new Error(
+        `Invalid axis ${axis}. Must be between 0 and ${this.shape.length - 1}`,
+      );
+    }
+
+    // For axis 0 concatenation, all other dimensions must match exactly
+    if (axis === 0) {
+      // For 1D tensors, they must have the same shape
+      if (this.shape.length === 1 && this.shape[0] !== tensor.shape[0]) {
+        throw new Error(
+          `Shape mismatch: tensors have different shapes at non-concatenating dimensions`,
+        );
+      }
+    }
+
+    // For other axes, validate shapes - all dimensions except concat axis must match
+    for (let i = 0; i < this.shape.length; i++) {
+      if (i !== axis && this.shape[i] !== tensor.shape[i]) {
+        throw new Error(
+          `Shape mismatch: tensors have different shapes at non-concatenating dimensions`,
+        );
+      }
+    }
+
+    // Calculate new shape
+    const newShape = [...this.shape];
+    newShape[axis] += tensor.shape[axis];
+
+    // Create new data array
+    const newData = new Float32Array(newShape.reduce((a, b) => a * b));
+
+    // Calculate strides for both tensors
+    const stride = this.shape[axis];
+    const preAxisSize = this.shape.slice(0, axis).reduce((a, b) => a * b, 1);
+    const postAxisSize = this.shape.slice(axis + 1).reduce((a, b) => a * b, 1);
+
+    // Copy data from both tensors
+    for (let i = 0; i < preAxisSize; i++) {
+      for (let j = 0; j < postAxisSize; j++) {
+        // Copy from first tensor
+        for (let k = 0; k < this.shape[axis]; k++) {
+          const srcIdx = i * stride * postAxisSize + k * postAxisSize + j;
+          const dstIdx =
+            i * (stride + tensor.shape[axis]) * postAxisSize +
+            k * postAxisSize +
+            j;
+          newData[dstIdx] = this.data[srcIdx];
+        }
+        // Copy from second tensor
+        for (let k = 0; k < tensor.shape[axis]; k++) {
+          const srcIdx =
+            i * tensor.shape[axis] * postAxisSize + k * postAxisSize + j;
+          const dstIdx =
+            i * (stride + tensor.shape[axis]) * postAxisSize +
+            (k + stride) * postAxisSize +
+            j;
+          newData[dstIdx] = tensor.data[srcIdx];
+        }
+      }
+    }
+
+    return new Tensor(
+      newData,
+      newShape,
+      this.requires_grad || tensor.requires_grad,
+    );
+  }
+
+  async slice(...args: SliceArg[]): Promise<Tensor> {
+    if (args.length > this.shape.length) {
+      throw new Error(
+        `Too many indices for tensor of dimension ${this.shape.length}`,
+      );
+    }
+
+    // Convert all arguments to normalized slice specs
+    const slices = args.map((arg, dim) =>
+      this.normalizeSlice(arg, this.shape[dim]),
+    );
+
+    // Calculate output shape and stride info
+    const { outputShape, isReducedDim } = this.calculateOutputShape(
+      slices,
+      this.shape,
+    );
+
+    // Handle empty result case
+    if (outputShape.length === 0 || outputShape.some((dim) => dim === 0)) {
+      return new Tensor(new Float32Array(0), outputShape, this.requires_grad);
+    }
+
+    // Create output tensor
+    const outputSize = outputShape.reduce((a, b) => a * b, 1);
+    const result = new Float32Array(outputSize);
+
+    // For each output position, calculate corresponding input position
+    await this.populateSlicedData(
+      result,
+      outputSize,
+      outputShape,
+      slices,
+      isReducedDim,
+    );
+
+    return new Tensor(result, outputShape, this.requires_grad);
+  }
+
+  private async populateSlicedData(
+    result: Float32Array,
+    outputSize: number,
+    outputShape: number[],
+    slices: [number, number, number][],
+    isReducedDim: boolean[],
+  ): Promise<void> {
+    // Process in chunks to avoid blocking the main thread
+    const CHUNK_SIZE = 1000;
+
+    for (let i = 0; i < outputSize; i += CHUNK_SIZE) {
+      const end = Math.min(i + CHUNK_SIZE, outputSize);
+
+      for (let j = i; j < end; j++) {
+        const outputCoords = this.indexToCoords(j, outputShape);
+        const inputCoords = this.mapToInputCoords(
+          outputCoords,
+          slices,
+          isReducedDim,
+        );
+        const inputIndex = this.coordsToIndex(inputCoords, this.shape);
+        result[j] = this.data[inputIndex];
+      }
+
+      // Yield to event loop periodically
+      if (end < outputSize) {
+        await new Promise((resolve) => setTimeout(resolve, 0));
+      }
+    }
+  }
+
+  private calculateOutputShape(
+    slices: [number, number, number][],
+    inputShape: number[],
+  ) {
+    // Pad slices to match input dimensions
+    const fullSlices = [...slices];
+    while (fullSlices.length < inputShape.length) {
+      fullSlices.push([0, inputShape[fullSlices.length], 1]);
+    }
+
+    // Track which dimensions are being reduced (single number index)
+    const isReducedDim = fullSlices.map(
+      ([start, end, step]) => end - start === 1 && step === 1,
+    );
+
+    // Calculate output shape, handling both positive and negative steps
+    const outputShape = fullSlices
+      .map(([start, end, step], i) => {
+        if (isReducedDim[i]) return 0;
+
+        if (step > 0) {
+          return Math.max(0, Math.ceil((end - start) / step));
+        } else {
+          // For negative steps, we need to handle the range differently
+          // When going backwards, we need to include the start position
+          const numElements = Math.max(
+            0,
+            Math.ceil((start - end + 1) / Math.abs(step)),
+          );
+          return numElements;
+        }
+      })
+      .filter((size) => size !== 0);
+
+    return { outputShape, isReducedDim };
+  }
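+
+  // normalizeSlice() below reduces every SliceArg to a [start, end, step]
+  // triple with Python-like semantics: 2 -> [2, 3, 1] (a single index that
+  // drops the dimension), ":" -> [0, dimSize, 1], and negative indices count
+  // from the end, so [-2, null] on a size-5 dimension becomes [3, 5, 1].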
+
+  private normalizeSlice(
+    arg: SliceArg,
+    dimSize: number,
+  ): [number, number, number] {
+    // Handle single number index
+    if (typeof arg === "number") {
+      const idx = arg < 0 ? dimSize + arg : arg;
+      if (idx < 0 || idx >= dimSize) {
+        throw new Error(
+          `Index ${arg} is out of bounds for dimension ${dimSize}`,
+        );
+      }
+      return [idx, idx + 1, 1];
+    }
+
+    // Handle full slice
+    if (arg === ":") {
+      return [0, dimSize, 1];
+    }
+
+    // Handle array spec [start, end, step]
+    let [start, end, step] = arg as [
+      number | null,
+      number | null,
+      number | null,
+    ];
+    step = step ?? 1;
+
+    if (step === 0) {
+      throw new Error("Slice step cannot be zero");
+    }
+
+    // Handle negative step
+    if (step < 0) {
+      // Default start is end of dimension for negative step
+      start = start ?? dimSize - 1;
+      // Default end is before beginning of dimension
+      end = end ?? -1;
+
+      // Convert negative indices to positive
+      start = start < 0 ? dimSize + start : start;
+      // For negative step, don't convert negative end index if it's the default -1
+      end = end < 0 && end !== -1 ? dimSize + end : end;
+
+      // Clamp to valid range for negative step
+      start = Math.min(dimSize - 1, Math.max(0, start));
+      end = Math.min(dimSize - 1, Math.max(0, end));
+    } else {
+      // Default start is beginning of dimension for positive step
+      start = start ?? 0;
+      // Default end is end of dimension
+      end = end ?? dimSize;
+
+      // Convert negative indices to positive
+      start = start < 0 ? dimSize + start : start;
+      end = end < 0 ? dimSize + end : end;
+
+      // Clamp to valid range
+      start = Math.min(dimSize - 1, Math.max(0, start));
+      end = Math.min(dimSize, Math.max(0, end));
+    }
+
+    return [start, end, step];
+  }
+
+  private indexToCoords(index: number, shape: number[]): number[] {
+    const coords = [];
+    let remaining = index;
+    let stride = shape.reduce((a, b) => a * b, 1);
+
+    for (const dimSize of shape) {
+      stride = stride / dimSize;
+      const coord = Math.floor(remaining / stride);
+      remaining = remaining % stride;
+      coords.push(coord);
+    }
+
+    return coords;
+  }
+
+  private mapToInputCoords(
+    outputCoords: number[],
+    slices: [number, number, number][],
+    isReducedDim: boolean[],
+  ): number[] {
+    const inputCoords: number[] = [];
+    let outputIdx = 0;
+
+    for (let i = 0; i < isReducedDim.length; i++) {
+      if (isReducedDim[i]) {
+        // For reduced dimensions, use the start index
+        inputCoords.push(slices[i][0]);
+      } else {
+        // For slice dimensions, calculate the actual position
+        const [start, , step] = slices[i];
+        inputCoords.push(start + outputCoords[outputIdx] * step);
+        outputIdx++;
+      }
+    }
+
+    return inputCoords;
+  }
+
+  private coordsToIndex(coords: number[], shape: number[]): number {
+    let index = 0;
+    let stride = 1;
+
+    for (let i = coords.length - 1; i >= 0; i--) {
+      index += coords[i] * stride;
+      stride *= shape[i];
+    }
+
+    return index;
+  }
+}
diff --git a/tests/integration/attention.test.ts b/tests/integration/attention.test.ts
new file mode 100644
index 0000000..b8ca9bf
--- /dev/null
+++ b/tests/integration/attention.test.ts
@@ -0,0 +1,131 @@
+import { test, expect } from "@playwright/test";
+
+test("MultiHeadAttention forward pass with known values", async ({ page }) => {
+  await page.goto("http://localhost:8080");
+
+  page.on("console", (msg) => {
+    console.log(msg);
+  });
+
+  // Inject test function
+  await page.evaluate(() => {
+    return new Promise<void>((resolve) => {
+      // @ts-expect-error ignore error for tests
+      import("/dist/bundle.js").then((module) => {
+        const { Tensor, MultiHeadAttention } = module;
+
+        window.runAttentionTest = async function () {
+          // Create sample input tensor with known values
+          const seqLength = 2;
+          const hiddenDim = 4;
+          const numHeads = 2;
+
+          const input = new Tensor(
+            new Float32Array([
+              0.1,
+              0.2,
+              0.3,
+              0.4, // First sequence
+              0.5,
+              0.6,
+              0.7,
+              0.8, // Second sequence
+            ]),
+            [seqLength, hiddenDim],
+            false,
+          );
+
+          // Create MultiHeadAttention
+          const attention = new MultiHeadAttention(hiddenDim, numHeads);
+
+          // Set known weights and biases for reproducibility
+          attention.qkv.weight = new Tensor(
+            new Float32Array([
+              // Q weights
+              0.1, 0.2, 0.3, 0.4, 0.2, 0.3, 0.4, 0.5, 0.3, 0.4, 0.5, 0.6, 0.4,
+              0.5, 0.6, 0.7,
+              // K weights
+              0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.4,
+              0.4, 0.4, 0.4,
+              // V weights
+              0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8,
+              0.8, 0.8, 0.8,
+            ]),
+            [hiddenDim, hiddenDim * 3],
+            true,
+          );
+
+          attention.qkv.bias = new Tensor(
+            new Float32Array([
+              // Q bias
+              0.1, 0.1, 0.1, 0.1,
+              // K bias
+              0.2, 0.2, 0.2, 0.2,
+              // V bias
+              0.3, 0.3, 0.3, 0.3,
+            ]),
+            [hiddenDim * 3],
+            true,
+          );
+
+          attention.output.weight = new Tensor(
+            new Float32Array([
+              0.1, 0.2, 0.3, 0.4, 0.2, 0.3, 0.4, 0.5, 0.3, 0.4, 0.5, 0.6, 0.4,
+              0.5, 0.6, 0.7,
+            ]),
+            [hiddenDim, hiddenDim],
+            true,
+          );
+
+          attention.output.bias = new Tensor(
+            new Float32Array([0.1, 0.1, 0.1, 0.1]),
+            [hiddenDim],
+            true,
+          );
+
+          // Forward pass
+          const [output] = await attention.forward(input);
+
+          return {
+            inputShape: input.shape,
+            inputData: Array.from(input.data),
+            outputShape: output.shape,
+            outputData: Array.from(output.data),
+          };
+        };
+        resolve();
+      });
+    });
+  });
+
+  // Run the test function in the browser context
+  const result = await page.evaluate(() => window.runAttentionTest());
+
+  // Validate shapes
+  expect(result.inputShape).toEqual([2, 4]); // [seq_len, hidden_dim]
+  expect(result.outputShape).toEqual([2, 4]); // [seq_len, hidden_dim]
+  console.log("result.outputData:", result.outputData.toString());
+
+  // Expected values computed using the same architecture with PyTorch
+  const expectedOutput = [
+    1.4622, 1.9985, 2.5347, 3.0709, 1.5701, 2.1462, 2.7224, 3.2985,
+  ];
+
+  // Validate output values
+  result.outputData.forEach((value, idx) => {
+    expect(value).toBeCloseTo(expectedOutput[idx], 4);
+  });
+
+  await page.close();
+});
+
+declare global {
+  interface Window {
+    runAttentionTest: () => Promise<{
+      inputShape: number[];
+      inputData: number[];
+      outputShape: number[];
+      outputData: number[];
+    }>;
+  }
+}
diff --git a/tests/integration/div.test.ts b/tests/integration/div.test.ts
new file mode 100644
index 0000000..277dc17
--- /dev/null
+++ b/tests/integration/div.test.ts
@@ -0,0 +1,67 @@
+import { test, expect } from "@playwright/test";
+
+test("Elementwise scalar/broadcasted division forward and backward pass", async ({
+  page,
+}) => {
+  await page.goto("http://localhost:8080");
+
+  page.on("console", (msg) => {
+    console.log(msg);
+  });
+
+  // Inject the test function
+  await page.evaluate(() => {
+    return new Promise<void>((resolve) => {
+      // @ts-expect-error ignore error for tests
+      import("/dist/bundle.js").then((module) => {
+        const { Tensor } = module;
+
+        // @ts-expect-error ignore error for tests
+        window.runDivTest = async function () {
+          const x = new Tensor(
+            new Float32Array([2.0, 4.0, 6.0, 8.0, 10.0, 12.0]),
+            [2, 3],
+            true,
+          );
+          const y = new Tensor(new Float32Array([2.0]), [1], false);
+
+          // Forward pass
+          const [z] = await x.div(y);
+
+          await z.backward();
+
+          return {
+            x: x,
+            y: y,
+            z: z,
+            grad_x: x.grad,
+            grad_y: y.grad,
+          };
+        };
+        resolve();
+      });
+    });
+  });
+
+  // Run the test function in the browser context
+  // @ts-expect-error ignore error for tests
+  const result = await page.evaluate(() => window.runDivTest());
+
+  // Perform assertions
+  expect(result.x.shape).toEqual([2, 3]);
+  expect(result.y.shape).toEqual([1]);
+  expect(result.z.shape).toEqual([2, 3]);
+  expect(result.grad_x.shape).toEqual([2, 3]);
+  // check that grad_y is null
+  expect(result.grad_y).toBeNull();
+
+  const zData = new Float32Array(Object.values(result.z.data));
+  const gradXData = new Float32Array(Object.values(result.grad_x.data));
+
+  expect(zData).toEqual(new Float32Array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]));
+
+  // For division, gradient with respect to x is 1/y
+  expect(gradXData).toEqual(new Float32Array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5]));
+
+  await page.close();
+});
diff --git a/tests/integration/embedding.test.ts b/tests/integration/embedding.test.ts
new file mode 100644
index 0000000..8eae819
--- /dev/null
+++ b/tests/integration/embedding.test.ts
@@ -0,0 +1,76 @@
+import { test, expect } from "@playwright/test";
+
+test("Embedding forward pass with known values", async ({ page }) => {
+  await page.goto("http://localhost:8080");
+
+  page.on("console", (msg) => {
+    console.log(msg);
+  });
+
+  // Inject test function
+  await page.evaluate(() => {
+    return new Promise<void>((resolve) => {
+      // @ts-expect-error ignore error for tests
+      import("/dist/bundle.js").then((module) => {
+        const { Tensor, Embedding } = module;
+
+        window.runEmbeddingTest = async function () {
+          const vocabSize = 128;
+          const embeddingDim = 2; // Using small dim for easy verification
+
+          // Create embedding layer
+          const embedding = new Embedding(vocabSize, embeddingDim);
+          console.log(embedding);
+
+          // Create input tensor with indices
+          const inputIndices = new Tensor(
+            new Float32Array([1, 5, 10]), // Sample indices
+            [3], // Sequence length of 3
+            false,
+          );
+
+          // Forward pass
+          const [embeddings] = await embedding.forward(inputIndices);
+
+          return {
+            inputIndices: Array.from(inputIndices.data),
+            embedding: embedding.embedding,
+            outputShape: embeddings.shape,
+            outputData: Array.from(embeddings.data),
+          };
+        };
+        resolve();
+      });
+    });
+  });
+
+  // Run the test function in the browser context
+  const result = await page.evaluate(() => window.runEmbeddingTest());
+
+  // Validate shapes
+  expect(result.outputShape).toEqual([3, 2]); // Sequence length x Embedding dim
+
+  const expectedOutput = [
+    result.embedding.data[2],
+    result.embedding.data[3],
+    result.embedding.data[10],
+    result.embedding.data[11],
+    result.embedding.data[20],
+    result.embedding.data[21],
+  ];
+
+  expect(result.outputData).toEqual(expectedOutput);
+
+  await page.close();
+});
+
+declare global {
+  interface Window {
+    runEmbeddingTest: () => Promise<{
+      inputIndices: number[];
+      embedding: { data: number[] };
+      outputShape: number[];
+      outputData: number[];
+    }>;
+  }
+}
diff --git a/tests/integration/gather.test.ts b/tests/integration/gather.test.ts
new file mode 100644
index 0000000..58f9470
--- /dev/null
+++ b/tests/integration/gather.test.ts
@@ -0,0 +1,85 @@
+import { test, expect } from "@playwright/test";
+
+test("Gather forward and backward pass", async ({ page }) => {
+  await page.goto("http://localhost:8080");
+
+  page.on("console", (msg) => {
+    console.log(msg);
+  });
+
+  await page.evaluate(() => {
+    return new Promise<void>((resolve) => {
+      // @ts-expect-error ignore error for tests
+      import("/dist/bundle.js").then((module) => {
+        const { Tensor } = module;
+
+        // @ts-expect-error ignore error for tests
+        window.runGatherTest = async function () {
+          // Create a simple embedding matrix with 3 embeddings of dimension 2
+          const embeddings = new Tensor(
+            new Float32Array([
+              1.0,
+              2.0, // embedding 0
+              3.0,
+              4.0, // embedding 1
+              5.0,
+              6.0, // embedding 2
+            ]),
+            [3, 2],
+            true,
+          );
+
+          // Look up embeddings at indices 1, 0 (second embedding, then first)
+          const indices = new Tensor(new Float32Array([1, 0]), [2, 1], false);
+
+          // Forward pass - gather embeddings
+          const [output] = await embeddings.gather(indices);
+
+          // Backward pass
+          await output.backward();
+
+          return {
+            embeddings: embeddings,
+            indices: indices,
+            output: output,
+            grad_embeddings: embeddings.grad,
+          };
+        };
+        resolve();
+      });
+    });
+  });
+
+  // Run the test function in the browser context
+  // @ts-expect-error ignore error for tests
+  const result = await page.evaluate(() => window.runGatherTest());
+
+  expect(result.output.shape).toEqual([2, 2]); // 2 selected embeddings of dimension 2
+  expect(result.grad_embeddings.shape).toEqual([3, 2]); // Same shape as input embeddings
+
+  // Forward pass assertions - should get embeddings at indices 1 and 0
+  const outputData = new Float32Array(Object.values(result.output.data));
+  expect(outputData).toEqual(
+    new Float32Array([
+      3.0,
+      4.0, // embedding at index 1
+      1.0,
+      2.0, // embedding at index 0
+    ]),
+  );
+
+  // Backward pass assertions - gradient should accumulate at the selected indices
+  const gradData = new Float32Array(
+    Object.values(result.grad_embeddings.data),
+  );
+  expect(gradData).toEqual(
+    new Float32Array([
+      1.0,
+      1.0, // gradient for embedding 0 (selected second)
+      1.0,
+      1.0, // gradient for embedding 1 (selected first)
+      0.0,
+      0.0, // gradient for embedding 2 (not selected)
+    ]),
+  );
+
+  await page.close();
+});
diff --git a/tests/integration/linear.test.ts b/tests/integration/linear.test.ts
new file mode 100644
index 0000000..c21cfc4
--- /dev/null
+++ b/tests/integration/linear.test.ts
@@ -0,0 +1,87 @@
+import { test, expect } from "@playwright/test";
+
+test("Linear forward pass with known values", async ({ page }) => {
+  await page.goto("http://localhost:8080");
+
+  page.on("console", (msg) => {
+    console.log(msg);
+  });
+
+  // Inject test function
+  await page.evaluate(() => {
+    return new Promise<void>((resolve) => {
+      // @ts-expect-error ignore error for tests
+      import("/dist/bundle.js").then((module) => {
+        const { Tensor, Linear } = module;
+
+        window.runLinearTest = async function () {
+          const inputSize = 3;
+          const outputSize = 2;
+
+          // Create linear layer
+          const linear = new Linear(inputSize, outputSize);
+
+          // Set known weights and biases for deterministic testing
+          linear.weight.data = new Float32Array([
+            0.1,
+            0.2, // First row
+            0.3,
+            0.4, // Second row
+            0.5,
+            0.6, // Third row
+          ]);
+
+          linear.bias.data = new Float32Array([0.1, 0.2]);
+
+          // Create input tensor
+          const input = new Tensor(
+            new Float32Array([1.0, 2.0, 3.0]), // Sample input
+            [1, 3], // Batch size 1, input size 3
+            false,
+          );
+
+          // Forward pass
+          const [output] = await linear.forward(input);
+
+          return {
+            inputData: Array.from(input.data),
+            weights: Array.from(linear.weight.data),
+            biases: Array.from(linear.bias.data),
+            outputShape: output.shape,
+            outputData: Array.from(output.data),
+          };
+        };
+        resolve();
+      });
+    });
+  });
+
+  // Run the test function in the browser context
+  const result = await page.evaluate(() => window.runLinearTest());
+
+  // Validate shapes
+  expect(result.outputShape).toEqual([1, 2]); // Batch size x Output size
+
+  // Calculate expected output manually:
+  // output[0] = (1.0 * 0.1 + 2.0 * 0.3 + 3.0 * 0.5) + 0.1 = 2.2 + 0.1 = 2.3
+  // output[1] = (1.0 * 0.2 + 2.0 * 0.4 + 3.0 * 0.6) + 0.2 = 2.8 + 0.2 = 3.0
+  const expectedOutput = [2.3, 3.0];
+
+  // Check if outputs match expected values within a small tolerance
+  expect(result.outputData[0]).toBeCloseTo(expectedOutput[0], 5);
+  expect(result.outputData[1]).toBeCloseTo(expectedOutput[1], 5);
+
+  await page.close();
+});
+
Promise<{ + inputData: number[]; + weights: number[]; + biases: number[]; + outputShape: number[]; + outputData: number[]; + }>; + } +} diff --git a/tests/integration/mlp.test.ts b/tests/integration/mlp.test.ts new file mode 100644 index 0000000..e329c1b --- /dev/null +++ b/tests/integration/mlp.test.ts @@ -0,0 +1,131 @@ +import { test, expect } from "@playwright/test"; + +test("MLP with SwiGLU activation forward pass with known values", async ({ + page, +}) => { + await page.goto("http://localhost:8080"); + + page.on("console", (msg) => { + console.log(msg); + }); + + // Inject test function + await page.evaluate(() => { + return new Promise((resolve) => { + // @ts-expect-error ignore error for tests + import("/dist/bundle.js").then((module) => { + const { Tensor, MLP } = module; + + window.runSwiGLUTest = async function () { + // Create sample input tensor with known values + const inputDim = 4; + const hiddenDim = 8; // Will be doubled internally for SwiGLU + const seqLength = 2; + + const input = new Tensor( + new Float32Array([ + 0.1, + 0.2, + 0.3, + 0.4, // First sequence + 0.5, + 0.6, + 0.7, + 0.8, // Second sequence + ]), + [seqLength, inputDim], + false, + ); + + // Create MLP with SwiGLU activation + const mlp = new MLP(inputDim, hiddenDim, "swiglu"); + + // Set known weights and biases for reproducibility + mlp.up.weight = new Tensor( + new Float32Array([ + // First half for gate + 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.2, 0.3, 0.4, 0.5, 0.6, + 0.7, 0.8, 0.9, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 0.4, 0.5, + 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, + // Second half for value + 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, + 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, + 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, + ]), + [inputDim, hiddenDim * 2], + true, + ); + + mlp.up.bias = new Tensor( + new Float32Array([ + // Gate bias + 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, + // Value bias + 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, + ]), + [hiddenDim * 2], + true, + ); + + mlp.down.weight = new Tensor( + new Float32Array([ + 0.1, 0.2, 0.3, 0.4, 0.2, 0.3, 0.4, 0.5, 0.3, 0.4, 0.5, 0.6, 0.4, + 0.5, 0.6, 0.7, 0.5, 0.6, 0.7, 0.8, 0.6, 0.7, 0.8, 0.9, 0.7, 0.8, + 0.9, 1.0, 0.8, 0.9, 1.0, 1.1, + ]), + [hiddenDim, inputDim], + true, + ); + + mlp.down.bias = new Tensor( + new Float32Array([0.1, 0.1, 0.1, 0.1]), + [inputDim], + true, + ); + + // Forward pass + const [output] = await mlp.forward(input); + + return { + inputShape: input.shape, + inputData: Array.from(input.data), + outputShape: output.shape, + outputData: Array.from(output.data), + }; + }; + resolve(); + }); + }); + }); + + // Run the test function in the browser context + const result = await page.evaluate(() => window.runSwiGLUTest()); + + // Validate shapes + expect(result.inputShape).toEqual([2, 4]); // [batch_size, input_dim] + expect(result.outputShape).toEqual([2, 4]); // [batch_size, input_dim] + console.log("result.outputData:", result.outputData.toString()); + + // Expected values computed using the same architecture with PyTorch + const expectedOutput = [ + 0.7809, 0.9126, 1.0443, 1.176, 5.0712, 5.9646, 6.8581, 7.7515, + ]; + + // Validate output values + result.outputData.forEach((value, idx) => { + expect(value).toBeCloseTo(expectedOutput[idx], 4); + }); + + await page.close(); +}); + +declare global { + interface Window { + runSwiGLUTest: () => Promise<{ + inputShape: number[]; + inputData: number[]; + outputShape: number[]; + outputData: number[]; + }>; + } +} diff --git 
a/tests/integration/nomic_embed.test.ts b/tests/integration/nomic_embed.test.ts new file mode 100644 index 0000000..432ea9c --- /dev/null +++ b/tests/integration/nomic_embed.test.ts @@ -0,0 +1,127 @@ +import { test, expect } from "@playwright/test"; + +test("NomicEmbed forward pass with known values", async ({ page }) => { + await page.goto("http://localhost:8080"); + + page.on("console", (msg) => { + console.log(msg); + }); + + // Inject test function + await page.evaluate(() => { + return new Promise((resolve) => { + // @ts-expect-error ignore error for tests + import("/dist/bundle.js").then((module) => { + const { Tensor, NomicEmbed } = module; + + window.runNomicEmbedTest = async function () { + // Create configuration matching the HF config + const config = { + vocab_size: 30528, + hidden_size: 768, + num_hidden_layers: 2, + num_attention_heads: 2, + intermediate_size: 3072, + hidden_act: "swiglu", + hidden_dropout_prob: 0.0, + attention_probs_dropout_prob: 0.0, + max_position_embeddings: 8192, + type_vocab_size: 2, + initializer_range: 0.02, + layer_norm_eps: 1e-12, + pad_token_id: 0, + position_embedding_type: "rotary", + use_cache: true, + classifier_dropout: null, + rotary_emb_fraction: 1.0, + qkv_proj_bias: false, + mlp_fc1_bias: false, + mlp_fc2_bias: false, + causal: false, + }; + + // Create sample input tensors + const seqLength = 1; // Small sequence for testing + + // Create input IDs tensor with some token IDs + const inputIds = new Tensor( + new Float32Array([1]), + [seqLength], + false, + ); + + // Create attention mask (all 1s for no masking) + const attentionMask = new Tensor( + new Float32Array([1]), + [seqLength], + false, + ); + + // Create position IDs (optional) + const positionIds = new Tensor( + new Float32Array([0]), + [seqLength], + false, + ); + + // Create token type IDs (optional) + const tokenTypeIds = new Tensor( + new Float32Array([0]), + [seqLength], + false, + ); + + // Initialize model + const model = new NomicEmbed(config); + + // Forward pass + const [output] = await model.forward( + inputIds, + attentionMask, + positionIds, + tokenTypeIds, + ); + + return { + inputShape: inputIds.shape, + outputShape: output.shape, + outputData: Array.from(output.data), + }; + }; + resolve(); + }); + }); + }); + + // Run the test function in the browser context + const result = await page.evaluate(() => window.runNomicEmbedTest()); + + // Test input shape + expect(result.inputShape).toEqual([1]); // [sequence_length] + + // Test output shape - should be [hidden_size] after pooling and normalization + expect(result.outputShape).toEqual([768]); // [hidden_size] + + // Verify output is normalized (L2 norm should be close to 1) + const l2Norm = Math.sqrt( + result.outputData.reduce((sum, val) => sum + val * val, 0), + ); + expect(l2Norm).toBeCloseTo(1, 6); + + // Verify output values are within reasonable range + result.outputData.forEach((value) => { + expect(Math.abs(value)).toBeLessThan(1); // Normalized values should be < 1 + }); + + await page.close(); +}); + +declare global { + interface Window { + runNomicEmbedTest: () => Promise<{ + inputShape: number[]; + outputShape: number[]; + outputData: number[]; + }>; + } +} diff --git a/tests/integration/norm.test.ts b/tests/integration/norm.test.ts new file mode 100644 index 0000000..580ef7b --- /dev/null +++ b/tests/integration/norm.test.ts @@ -0,0 +1,86 @@ +import { test, expect } from "@playwright/test"; + +test("LayerNorm forward pass with known values", async ({ page }) => { + await 
page.goto("http://localhost:8080"); + + page.on("console", (msg) => { + console.log(msg); + }); + + // Inject test function + await page.evaluate(() => { + return new Promise((resolve) => { + // @ts-expect-error ignore error for tests + import("/dist/bundle.js").then((module) => { + const { Tensor, LayerNorm } = module; + + window.runLayerNormTest = async function () { + // Create a simple input tensor with known values + const input = new Tensor( + new Float32Array([1, 2, 3, 4, 5, 6]), // Sample values + [2, 3], // 2 sequences, 3 features each + false, + ); + + // Create LayerNorm with normalized_shape [3] + const layerNorm = new LayerNorm([3], 1e-5); + + // Set known values for gamma and beta + layerNorm.gamma.data.set([1.0, 1.0, 1.0]); + layerNorm.beta.data.set([0.0, 0.0, 0.0]); + + // Forward pass + const [output] = await layerNorm.forward(input); + + return { + inputData: Array.from(input.data), + inputShape: input.shape, + outputShape: output.shape, + outputData: Array.from(output.data), + gamma: Array.from(layerNorm.gamma.data), + beta: Array.from(layerNorm.beta.data), + }; + }; + resolve(); + }); + }); + }); + + // Run the test function in the browser context + const result = await page.evaluate(() => window.runLayerNormTest()); + + // Validate shapes + expect(result.inputShape).toEqual([2, 3]); + expect(result.outputShape).toEqual([2, 3]); + + // For the input [1,2,3] and [4,5,6], with gamma=1 and beta=0, + // we can pre-calculate the expected normalized values + const expectedOutput = [ + -1.224744871391589, + 0, + 1.224744871391589, // First sequence normalized + -1.224744871391589, + 0, + 1.224744871391589, // Second sequence normalized + ]; + + // Check if output matches expected values (using approximate equality) + result.outputData.forEach((val, idx) => { + expect(val).toBeCloseTo(expectedOutput[idx], 4); + }); + + await page.close(); +}); + +declare global { + interface Window { + runLayerNormTest: () => Promise<{ + inputData: number[]; + inputShape: number[]; + outputShape: number[]; + outputData: number[]; + gamma: number[]; + beta: number[]; + }>; + } +} diff --git a/tests/integration/rotary.test.ts b/tests/integration/rotary.test.ts new file mode 100644 index 0000000..e382422 --- /dev/null +++ b/tests/integration/rotary.test.ts @@ -0,0 +1,121 @@ +import { test, expect } from "@playwright/test"; + +test("Rotary positional embedding forward pass with known values", async ({ + page, +}) => { + await page.goto("http://localhost:8080"); + + page.on("console", (msg) => { + console.log(msg); + }); + + // Inject test function + await page.evaluate(() => { + return new Promise((resolve) => { + // @ts-expect-error ignore error for tests + import("/dist/bundle.js").then((module) => { + const { Tensor, RotaryEmbedding } = module; + + window.runRotaryEmbeddingTest = async function () { + const seqLength = 4; + const dimension = 8; // Must be divisible by 2 for rotary embeddings + const base = 10000.0; + + // Create rotary embedding layer + const rotaryEmbed = new RotaryEmbedding(base, dimension); + + // Create sample input tensor + const input = new Tensor( + new Float32Array([ + 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.2, 0.3, 0.4, 0.5, 0.6, + 0.7, 0.8, 0.9, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 0.4, 0.5, + 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, + ]), + [seqLength, dimension], + false, + ); + + // Forward pass + const [rotatedOutput] = await rotaryEmbed.forward(input); + console.log("rotatedOutput", rotatedOutput.data.toString()); + + // Get the theta values and position encodings for 
verification + const theta = rotaryEmbed.createTheta(dimension, base); + const [cache] = await rotaryEmbed.buildCache(seqLength); + + return { + inputShape: input.shape, + inputData: Array.from(input.data), + outputShape: rotatedOutput.shape, + outputData: Array.from(rotatedOutput.data), + theta: Array.from(theta), + cache: Array.from(cache.data), + idxTheta: Array.from(rotaryEmbed.idxTheta), + }; + }; + resolve(); + }); + }); + }); + + // Run the test function in the browser context + const result = await page.evaluate(() => window.runRotaryEmbeddingTest()); + + // Validate shapes + expect(result.inputShape).toEqual([4, 8]); + expect(result.outputShape).toEqual([4, 8]); + + // Validate theta calculation + const expectedTheta = Array.from([1.0, 0.1, 0.01, 0.001]); + + result.theta.forEach((value, idx) => { + expect(value).toBeCloseTo(expectedTheta[idx], 4); + }); + + const expectedIdxTheta = Array.from([ + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0e-1, 1.0e-2, 1.0e-3, 2.0, 2.0e-1, 2.0e-2, + 2.0e-3, 3.0, 3.0e-1, 3.0e-2, 3.0e-3, + ]); + + result.idxTheta.forEach((value, idx) => { + expect(value).toBeCloseTo(expectedIdxTheta[idx], 4); + }); + + // Validate cache calculation + const expectedCacheOutput = Array.from([ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0e-1, 1.0e-2, 1.0e-3, 1.0, + 1.0e-1, 1.0e-2, 1.0e-3, 2.0, 2.0e-1, 2.0e-2, 2.0e-3, 2.0, 2.0e-1, 2.0e-2, + 2.0e-3, 3.0, 3.0e-1, 3.0e-2, 3.0e-3, 3.0, 3.0e-1, 3.0e-2, 3.0e-3, + ]); + + result.cache.forEach((value, idx) => { + expect(value).toBeCloseTo(expectedCacheOutput[idx], 4); + }); + + const expectedRotatedOutput = Array.from([ + 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, -0.3968, 0.2286, 0.392, 0.4991, + 0.4925, 0.7265, 0.804, 0.9005, -0.7614, 0.2331, 0.4819, 0.598, -0.0185, + 0.8635, 0.9098, 1.0012, -0.5089, 0.2117, 0.5697, 0.6967, -0.7355, 1.0076, + 1.0175, 1.1021, + ]); + + result.outputData.forEach((value, idx) => { + expect(value).toBeCloseTo(expectedRotatedOutput[idx], 4); + }); + + await page.close(); +}); + +declare global { + interface Window { + runRotaryEmbeddingTest: () => Promise<{ + inputShape: number[]; + inputData: number[]; + outputShape: number[]; + outputData: number[]; + theta: number[]; + cache: number[]; + idxTheta: number[]; + }>; + } +} diff --git a/tests/integration/tensor_stats.test.ts b/tests/integration/tensor_stats.test.ts new file mode 100644 index 0000000..a26fb2a --- /dev/null +++ b/tests/integration/tensor_stats.test.ts @@ -0,0 +1,80 @@ +import { Tensor } from "../../src/tensor/tensor.js"; +import { test, expect } from "@playwright/test"; + +test.describe("Tensor Statistics Operations", () => { + test.describe("mean", () => { + test("should calculate mean along specified dimensions", async () => { + const data = new Float32Array([1, 2, 3, 4, 5, 6]); + const tensor = new Tensor(data, [2, 3], false); + + const mean = await tensor.mean([1]); + expect(mean.shape).toEqual([2]); + expect(Array.from(mean.data)).toEqual([2, 5]); // [mean(1,2,3), mean(4,5,6)] + }); + + test("should handle single dimension tensors", async () => { + const data = new Float32Array([1, 2, 3, 4]); + const tensor = new Tensor(data, [4], false); + + const mean = await tensor.mean([0]); + expect(mean.shape).toEqual([1]); + expect(mean.data[0]).toBeCloseTo(2.5); // mean(1,2,3,4) + }); + }); + + test.describe("variance", () => { + test("should calculate variance along specified dimensions", async () => { + const data = new Float32Array([1, 2, 3, 4, 5, 6]); + const tensor = new Tensor(data, [2, 3], false); + + const variance = await 
tensor.variance([1]); + expect(variance.shape).toEqual([2]); + // Variance of [1,2,3] and [4,5,6] + expect( + Array.from(variance.data).map((x) => Number(x.toFixed(2))), + ).toEqual([0.67, 0.67]); + }); + + test("should handle single dimension tensors", async () => { + const data = new Float32Array([2, 4, 4, 6]); + const tensor = new Tensor(data, [4], false); + + const variance = await tensor.variance([0]); + expect(variance.shape).toEqual([1]); + expect(variance.data[0]).toBeCloseTo(2); // variance of [2,4,4,6] + }); + }); + + test.describe("sqrt", () => { + test("should calculate element-wise square root", async () => { + const data = new Float32Array([1, 4, 9, 16]); + const tensor = new Tensor(data, [4], false); + + const sqrt = await tensor.sqrt(); + expect(sqrt.shape).toEqual([4]); + expect(Array.from(sqrt.data)).toEqual([1, 2, 3, 4]); + }); + + test("should handle multi-dimensional tensors", async () => { + const data = new Float32Array([1, 4, 9, 16, 25, 36]); + const tensor = new Tensor(data, [2, 3], false); + + const sqrt = await tensor.sqrt(); + expect(sqrt.shape).toEqual([2, 3]); + expect(Array.from(sqrt.data)).toEqual([1, 2, 3, 4, 5, 6]); + }); + }); + + test.describe("combined operations", () => { + test("should correctly compute standard deviation using sqrt(variance)", async () => { + const data = new Float32Array([2, 4, 4, 6]); + const tensor = new Tensor(data, [4], false); + + const variance = await tensor.variance([0]); + const stdDev = await variance.sqrt(); + + expect(stdDev.shape).toEqual([1]); // The shape should be [1] for a scalar result + expect(stdDev.data[0]).toBeCloseTo(Math.sqrt(2)); // The actual value check + }); + }); +}); diff --git a/tests/unit/tensor.test.ts b/tests/unit/tensor.test.ts index d04f9a8..775a856 100644 --- a/tests/unit/tensor.test.ts +++ b/tests/unit/tensor.test.ts @@ -102,4 +102,346 @@ describe("Tensor", () => { ); }); }); + + describe("concat", () => { + it("should concatenate 1D tensors along axis 0", async () => { + const t1 = new Tensor(new Float32Array([1, 2, 3]), [3]); + const t2 = new Tensor(new Float32Array([4, 5, 6]), [3]); + + const result = await t1.concat(t2, 0); + + expect(result.shape).toEqual([6]); + expect(Array.from(result.data)).toEqual([1, 2, 3, 4, 5, 6]); + }); + + it("should concatenate 2D tensors along axis 0", async () => { + const t1 = new Tensor(new Float32Array([1, 2, 3, 4]), [2, 2]); + const t2 = new Tensor(new Float32Array([5, 6, 7, 8]), [2, 2]); + + const result = await t1.concat(t2, 0); + + expect(result.shape).toEqual([4, 2]); + expect(Array.from(result.data)).toEqual([1, 2, 3, 4, 5, 6, 7, 8]); + }); + + it("should concatenate 2D tensors along axis 1", async () => { + const t1 = new Tensor(new Float32Array([1, 2, 3, 4]), [2, 2]); + const t2 = new Tensor(new Float32Array([5, 6, 7, 8]), [2, 2]); + + const result = await t1.concat(t2, 1); + + expect(result.shape).toEqual([2, 4]); + expect(Array.from(result.data)).toEqual([1, 2, 5, 6, 3, 4, 7, 8]); + }); + + it("should throw error for invalid axis", async () => { + const t1 = new Tensor(new Float32Array([1, 2]), [2]); + const t2 = new Tensor(new Float32Array([3, 4]), [2]); + + await expect(t1.concat(t2, 1)).rejects.toThrow("Invalid axis"); + }); + + it("should throw error for shape mismatch", async () => { + const t1 = new Tensor(new Float32Array([1, 2]), [2]); + const t2 = new Tensor(new Float32Array([3, 4, 5]), [3]); + + await expect(t1.concat(t2, 0)).rejects.toThrow("Shape mismatch"); + }); + + it("should preserve requires_grad", async () => { + const t1 = new 
Tensor(new Float32Array([1, 2]), [2], true); + const t2 = new Tensor(new Float32Array([3, 4]), [2], false); + + const result = await t1.concat(t2, 0); + + expect(result.requires_grad).toBe(true); + }); + }); + describe("slice", () => { + it("should slice a 1D tensor with basic indexing", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4, 5]), [5]); + const result = await tensor.slice([1, 4]); + + expect(result.shape).toEqual([3]); + expect(Array.from(result.data)).toEqual([2, 3, 4]); + }); + + it("should handle full slice with ':'", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4]), [4]); + const result = await tensor.slice(":"); + + expect(result.shape).toEqual([4]); + expect(Array.from(result.data)).toEqual([1, 2, 3, 4]); + }); + + it("should slice with step size", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4, 5, 6]), [6]); + const result = await tensor.slice([null, null, 2]); + + expect(result.shape).toEqual([3]); + expect(Array.from(result.data)).toEqual([1, 3, 5]); + }); + + it("should handle negative indices", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4, 5]), [5]); + const result = await tensor.slice([-3, -1]); + + expect(result.shape).toEqual([2]); + expect(Array.from(result.data)).toEqual([3, 4]); + }); + + it("should slice a 2D tensor along both dimensions", async () => { + const tensor = new Tensor( + new Float32Array([1, 2, 3, 4, 5, 6, 7, 8, 9]), + [3, 3], + ); + const result = await tensor.slice([0, 2], [1, 3]); + + expect(result.shape).toEqual([2, 2]); + expect(Array.from(result.data)).toEqual([2, 3, 5, 6]); + }); + + it("should handle reverse slicing with negative step", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4, 5]), [5]); + const result = await tensor.slice([null, null, -1]); + + expect(result.shape).toEqual([5]); + expect(Array.from(result.data)).toEqual([5, 4, 3, 2, 1]); + }); + + it("should preserve requires_grad", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4]), [4], true); + const result = await tensor.slice([1, 3]); + + expect(result.requires_grad).toBe(true); + }); + + it("should handle mixed slicing with numbers and slices", async () => { + const tensor = new Tensor( + new Float32Array([1, 2, 3, 4, 5, 6, 7, 8, 9]), + [3, 3], + ); + const result = await tensor.slice(1, ":"); + + expect(result.shape).toEqual([3]); + expect(Array.from(result.data)).toEqual([4, 5, 6]); + }); + + it("should slice the first half of a dimension", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4, 5, 6]), [6]); + const result = await tensor.slice([0, 3]); + + expect(result.shape).toEqual([3]); + expect(Array.from(result.data)).toEqual([1, 2, 3]); + }); + + it("should handle overlapping step slices", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4, 5]), [5]); + const result = await tensor.slice([0, 4, 2]); + + expect(result.shape).toEqual([2]); + expect(Array.from(result.data)).toEqual([1, 3]); + }); + + it("should throw error for invalid dimensions", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3]), [3]); + await expect(tensor.slice(":", ":")).rejects.toThrow( + "Too many indices for tensor", + ); + }); + + it("should handle 3D tensor slicing", async () => { + const tensor = new Tensor( + new Float32Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]), + [3, 2, 2], + ); + const result = await tensor.slice(":", [0, 2], ":"); + + expect(result.shape).toEqual([3, 2, 2]); + 
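// [0, 2] spans the whole second axis, so the slice returns every element unchanged +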
expect(Array.from(result.data)).toEqual([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + ]); + }); + }); + describe("pow", () => { + it("should correctly square a tensor (power of 2)", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4]), [2, 2]); + const [result] = await tensor.pow(2); + + expect(result.shape).toEqual([2, 2]); + expect(Array.from(result.data)).toEqual([1, 4, 9, 16]); + }); + + it("should correctly calculate square root (power of 0.5)", async () => { + const tensor = new Tensor(new Float32Array([1, 4, 9, 16]), [2, 2]); + const [result] = await tensor.pow(0.5); + + expect(result.shape).toEqual([2, 2]); + expect(Array.from(result.data)).toEqual([1, 2, 3, 4]); + }); + + it("should handle negative numbers with even powers", async () => { + const tensor = new Tensor(new Float32Array([-2, -3, 2, 3]), [2, 2]); + const [result] = await tensor.pow(2); + + expect(result.shape).toEqual([2, 2]); + expect(Array.from(result.data)).toEqual([4, 9, 4, 9]); + }); + + it("should preserve requires_grad", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4]), [2, 2], true); + const [result] = await tensor.pow(2); + + expect(result.requires_grad).toBe(true); + }); + + it("should handle power of 1 (identity)", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4]), [2, 2]); + const [result] = await tensor.pow(1); + + expect(Array.from(result.data)).toEqual([1, 2, 3, 4]); + }); + + it("should handle power of 0 (all ones)", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4]), [2, 2]); + const [result] = await tensor.pow(0); + + expect(Array.from(result.data)).toEqual([1, 1, 1, 1]); + }); + + it("should maintain shape for 1D tensors", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4]), [4]); + const [result] = await tensor.pow(2); + + expect(result.shape).toEqual([4]); + expect(Array.from(result.data)).toEqual([1, 4, 9, 16]); + }); + }); + describe("sum", () => { + it("should sum 1D tensor correctly", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4]), [4]); + const result = await tensor.sum([0]); + + expect(result.shape).toEqual([1]); + expect(Array.from(result.data)[0]).toBe(10); // 1 + 2 + 3 + 4 + }); + + it("should sum 2D tensor along first dimension", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4, 5, 6]), [2, 3]); + const result = await tensor.sum([0]); + + expect(result.shape).toEqual([3]); + expect(Array.from(result.data)).toEqual([5, 7, 9]); // [1+4, 2+5, 3+6] + }); + + it("should sum 2D tensor along second dimension", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4, 5, 6]), [2, 3]); + const result = await tensor.sum([1]); + + expect(result.shape).toEqual([2]); + expect(Array.from(result.data)).toEqual([6, 15]); // [1+2+3, 4+5+6] + }); + + it("should preserve requires_grad", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4]), [4], true); + const result = await tensor.sum([0]); + + expect(result.requires_grad).toBe(true); + }); + + it("should handle tensor with all zeros", async () => { + const tensor = new Tensor(new Float32Array([0, 0, 0, 0]), [2, 2]); + const result = await tensor.sum([1]); + + expect(result.shape).toEqual([2]); + expect(Array.from(result.data)).toEqual([0, 0]); + }); + + it("should handle single element tensor", async () => { + const tensor = new Tensor(new Float32Array([5]), [1]); + const result = await tensor.sum([0]); + + expect(result.shape).toEqual([1]); + 
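// Summing a single-element tensor is the identity +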
expect(Array.from(result.data)[0]).toBe(5); + }); + + it("should handle summing along multiple dimensions", async () => { + const tensor = new Tensor(new Float32Array([1, 2, 3, 4, 5, 6]), [2, 3]); + const result = await tensor.sum([0, 1]); + + expect(result.shape).toEqual([1]); + expect(Array.from(result.data)[0]).toBe(21); // sum of all elements + }); + + it("should maintain correct shape after summing", async () => { + const tensor = new Tensor( + new Float32Array([1, 2, 3, 4, 5, 6, 7, 8, 9]), + [3, 3], + ); + const result = await tensor.sum([0]); + + expect(result.shape).toEqual([3]); + expect(Array.from(result.data)).toEqual([12, 15, 18]); // column sums + }); + }); + + it("should default requires_grad to false", () => { + const data = new Float32Array([1, 2, 3, 4]); + const shape = [2, 2]; + const tensor = new Tensor(data, shape); + + expect(tensor.requires_grad).toBe(false); + }); + + it("should throw an error if the number of elements in data and shape are different", () => { + const data = new Float32Array([1, 2, 3]); + const shape = [2, 2]; + + expect(() => new Tensor(data, shape)).toThrow("Incompatible shapes"); + }); + + describe("norm", () => { + it("should calculate Euclidean norm along default dimension", async () => { + const tensor = new Tensor(new Float32Array([3, 4]), [2]); + const [result] = await tensor.norm(); + + // Should be sqrt(3^2 + 4^2) = 5 + expect(result.shape).toEqual([1]); + expect(Array.from(result.data)[0]).toBeCloseTo(5); + }); + + it("should handle 2D tensor Euclidean norm", async () => { + const tensor = new Tensor(new Float32Array([3, 4, 6, 8]), [2, 2]); + const [result] = await tensor.norm(); + + console.log("result:", result.data.toString()); + + // For dim=0: sqrt(3^2 + 6^2) and sqrt(4^2 + 8^2) + expect(result.shape).toEqual([2]); + expect(Array.from(result.data)[0]).toBeCloseTo(6.708203932499369); // sqrt(45) + expect(Array.from(result.data)[1]).toBeCloseTo(8.94427190999916); // sqrt(80) + }); + + it("should preserve requires_grad", async () => { + const tensor = new Tensor(new Float32Array([3, 4]), [2], true); + const [result] = await tensor.norm(); + + expect(result.requires_grad).toBe(true); + }); + + it("should handle tensor with all zeros", async () => { + const tensor = new Tensor(new Float32Array([0, 0, 0, 0]), [2, 2]); + const [result] = await tensor.norm(); + + expect(Array.from(result.data)).toEqual([0, 0]); + }); + + it("should handle 1D tensor with single element", async () => { + const tensor = new Tensor(new Float32Array([5]), [1]); + const [result] = await tensor.norm(); + + expect(result.shape).toEqual([1]); + expect(Array.from(result.data)[0]).toBe(5); + }); + }); });
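Every integration suite above follows the same scaffold: serve the built bundle with the http-server start script, import /dist/bundle.js inside page.evaluate, expose a window.run*Test function, and assert on shapes and data back in the Node context. One layer added in this diff, MultiHeadAttention, has no integration test yet; the sketch below shows what one could look like in that style. It is a minimal, shape-only check and an assumption-laden sketch, not part of this diff: it assumes MultiHeadAttention is exported from the bundle and that its forward returns [Tensor] like the other layers, and the name window.runAttentionTest plus the literal input values are illustrative only.

import { test, expect } from "@playwright/test";

test("MultiHeadAttention forward pass shapes", async ({ page }) => {
  await page.goto("http://localhost:8080");

  page.on("console", (msg) => {
    console.log(msg);
  });

  // Inject test function
  await page.evaluate(() => {
    return new Promise((resolve) => {
      // @ts-expect-error ignore error for tests
      import("/dist/bundle.js").then((module) => {
        const { Tensor, MultiHeadAttention } = module;

        window.runAttentionTest = async function () {
          const seqLength = 2;
          const hiddenDim = 4;
          const numHeads = 2; // head_dim = hiddenDim / numHeads = 2

          // Projections are randomly initialized, so exact values are not asserted
          const attention = new MultiHeadAttention(hiddenDim, numHeads);

          const input = new Tensor(
            new Float32Array([
              0.1, 0.2, 0.3, 0.4, // first position
              0.5, 0.6, 0.7, 0.8, // second position
            ]),
            [seqLength, hiddenDim],
            false,
          );

          // Forward pass
          const [output] = await attention.forward(input);

          return {
            inputShape: input.shape,
            outputShape: output.shape,
            outputData: Array.from(output.data),
          };
        };
        resolve();
      });
    });
  });

  // Run the test function in the browser context
  const result = await page.evaluate(() => window.runAttentionTest());

  // Self-attention plus the output projection preserves (seq_len, hidden_dim)
  expect(result.outputShape).toEqual([2, 4]);

  // With random weights, only sanity-check that every value is finite
  result.outputData.forEach((value) => {
    expect(Number.isFinite(value)).toBe(true);
  });

  await page.close();
});

declare global {
  interface Window {
    runAttentionTest: () => Promise<{
      inputShape: number[];
      outputShape: number[];
      outputData: number[];
    }>;
  }
}

Because the QKV and output projections are randomly initialized, the sketch avoids asserting exact outputs; a deterministic variant would set attention.qkv.weight and attention.output.weight by hand, the same way the Linear and MLP tests fix their weights before the forward pass.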