Skip to content

Commit

Permalink
Add RNA Seq normalization methods (#136)
Browse files Browse the repository at this point in the history
* add empty test

* rename file ending

* tpkm & rpkm + tests

* with record type

* input as record types

* Added XML tags

* modified axis labelling
  • Loading branch information
ScheidTo authored May 14, 2024
1 parent b98e88b commit 32d20c3
Show file tree
Hide file tree
Showing 7 changed files with 412 additions and 0 deletions.
1 change: 1 addition & 0 deletions BioFSharp
Submodule BioFSharp added at 92be7c
293 changes: 293 additions & 0 deletions docs/rnaseq_normalization.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions src/BioFSharp.Stats/BioFSharp.Stats.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,12 @@
</PropertyGroup>
<ItemGroup>
<Compile Include="OntologyEnrichment.fs" />
<Compile Include="RNASeq.fs" />
<Compile Include="SurprisalAnalysisEmpiricalPermutationTest.fs" />

<None Include="Playground\OntologyEnrichment.fsx" />
<None Include="Playground\SurprisalAnalysisEmpiricalPermutationTest.fsx" />
<None Include="Playground\RNASeq.fsx" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="FSharpAux" Version="2.0.0" />
Expand Down
5 changes: 5 additions & 0 deletions src/BioFSharp.Stats/Playground/RNASeq.fsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#load "../RNASeq.fs"
open BioFSharp
open BioFSharp.Stats
open BioFSharp.Stats.RNASeq

71 changes: 71 additions & 0 deletions src/BioFSharp.Stats/RNASeq.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
namespace BioFSharp.Stats

open System
open System.Collections.Generic

/// Contains types and functions needed for RNA-Seq normalization
module RNASeq =

    /// Input type for RNA-Seq normalization: one gene with its length and raw read count.
    type RNASeqInput = {
        /// Identifier of the gene
        GeneID : string
        /// Length of the gene. The normalizations in this module divide it by 1000 to obtain kilobases.
        GeneLength : float
        /// Raw (not normalized) count of the gene
        GeneCount : float
    } with
        /// Creates an RNASeqInput from gene id, gene length and gene count (in that order)
        static member Create id gl gc = {GeneID=id;GeneLength=gl;GeneCount=gc}

    /// Normalization method that was used to produce a normalized count
    type NormalizationMethod =
        | RPKM
        | TPM

    /// Type with GeneID, normalized data and method of normalization
    type NormalizedCounts = {
        /// Identifier of the gene
        GeneID : string
        /// Count after normalization
        NormalizedCount : float
        /// Method that produced NormalizedCount
        NormalizationMethod: NormalizationMethod
    } with
        /// Creates a NormalizedCounts from gene id, normalized count and normalization method (in that order)
        static member Create id nc nm = {GeneID=id;NormalizedCount=nc;NormalizationMethod=nm}

    /// Divides a value by the gene length expressed in kilobases (geneLength / 1000)
    let private perKilobase (geneLength: float) (value: float) =
        value / (geneLength / 1000.)

    /// Calculates Reads Per Million: counts divided by (sum of all reads / 1,000,000)
    let private calcRPM (sumOfAllReadsPerMil: float) (counts: float) =
        counts / sumOfAllReadsPerMil

    /// Calculates RPKM from a Reads-Per-Million value and the gene length
    let private calcRPKM (geneLength: float) (rpm: float) =
        perKilobase geneLength rpm

    /// Performs RPKM normalization.
    /// The result is a deferred sequence; each enumeration of the result reads the input exactly once.
    let private rpkmsOf (idLengthAndCounts:seq<RNASeqInput>) =
        seq {
            // The input is needed twice (library total, then per gene), so it is materialized
            // once instead of re-enumerating a possibly lazy or expensive source.
            let genes = Seq.toArray idLengthAndCounts
            let sumOfAllReadsPerMil =
                (genes |> Seq.sumBy (fun gene -> gene.GeneCount)) / 1000000.
            for gene in genes do
                let rpm = calcRPM sumOfAllReadsPerMil gene.GeneCount
                yield NormalizedCounts.Create gene.GeneID (calcRPKM gene.GeneLength rpm) RPKM
        }

    /// Returns RPKM normalized data, one result per input gene and in input order.
    /// No input validation is performed: a total count of 0 or a gene length of 0 leads to
    /// NaN/infinity results according to floating point division.
    let rpkms (idLengthAndCounts:seq<RNASeqInput>) =
        rpkmsOf idLengthAndCounts

    /// Performs TPM normalization.
    /// The result is a deferred sequence; each enumeration of the result reads the input exactly once.
    let private tpmsOf (idLengthAndCounts:seq<RNASeqInput>) =
        seq {
            let genes = Seq.toArray idLengthAndCounts
            // Reads per kilobase: count / (length / 1000), the same length scaling that RPKM uses.
            let rpks =
                genes |> Array.map (fun gene -> perKilobase gene.GeneLength gene.GeneCount)
            let sumOfAllRpkPerMil =
                Seq.sum rpks / 1000000.
            yield!
                Array.map2
                    (fun (gene: RNASeqInput) rpk -> NormalizedCounts.Create gene.GeneID (rpk / sumOfAllRpkPerMil) TPM)
                    genes
                    rpks
        }

    /// Returns TPM normalized data, one result per input gene and in input order.
    /// No input validation is performed: a gene length of 0 or an all-zero input leads to
    /// NaN/infinity results according to floating point division.
    let tpms (idLengthAndCounts:seq<RNASeqInput>) =
        tpmsOf idLengthAndCounts

37 changes: 37 additions & 0 deletions tests/BioFSharp.Tests/BioFSharp.Stats/RNASeqTests.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
module RNASeqTests

open BioFSharp.Stats
open Expecto


// Pairs "stringtest<i>" with (i, i) for i = 1.0 and 2.0.
let testSeq =
    seq { 1. .. 2. }
    |> Seq.map (fun i -> ("stringtest" + i.ToString(), (i, i)))

// Raw inputs: gene ids, gene lengths and gene counts (same order in all three).
let testgeneID = Seq.ofList [ "stringtest1"; "stringtest2" ]
let testLength = Seq.ofList [ 1.; 2. ]
let testCount = Seq.ofList [ 1.; 2. ]

// The inputs combined into the record type consumed by RNASeq.rpkms / RNASeq.tpms.
let testInSeq =
    Seq.zip3 testgeneID testLength testCount
    |> Seq.map (fun (geneId, geneLength, geneCount) -> RNASeq.RNASeqInput.Create geneId geneLength geneCount)

// Expected normalized values per gene id.
let resultRPKM = Seq.ofList [ ("stringtest1", 333333333.3333333); ("stringtest2", 333333333.3333333) ]
let resultTPM = Seq.ofList [ ("stringtest1", 500000.); ("stringtest2", 500000.) ]

// Expected values wrapped into the result record type, tagged with the method used.
let RPKMres =
    resultRPKM
    |> Seq.map (fun (geneId, rpkm) -> RNASeq.NormalizedCounts.Create geneId rpkm RNASeq.NormalizationMethod.RPKM)
let TPMres =
    resultTPM
    |> Seq.map (fun (geneId, tpm) -> RNASeq.NormalizedCounts.Create geneId tpm RNASeq.NormalizationMethod.TPM)
[<Tests>]
let RNASeqTests =
    // Each case compares the normalized output element-by-element with the expected records.
    testList "RNASeqTests" [
        testCase "RPKM" <| fun () ->
            let actual = RNASeq.rpkms testInSeq
            Expect.sequenceEqual actual RPKMres "RPKM did not return correct Sequence"

        testCase "TPM" <| fun () ->
            let actual = RNASeq.tpms testInSeq
            Expect.sequenceEqual actual TPMres "TPM did not return correct Sequence"
    ]
2 changes: 2 additions & 0 deletions tests/BioFSharp.Tests/BioFSharp.Tests.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
<Compile Include="BioFSharp\BioCollections.fs" />
<Compile Include="BioFSharp\BioItemTests.fs" />
<Compile Include="BioFSharp\PhylTreeTests.fs" />
<Compile Include="BioFSharp.Stats\RNASeqTests.fs" />
<Compile Include="Main.fs" />
</ItemGroup>

Expand All @@ -35,5 +36,6 @@
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.10.0-release-24177-07" />
<ProjectReference Include="..\..\src\BioFSharp\BioFSharp.fsproj" />
<ProjectReference Include="..\..\src\BioFSharp.IO\BioFSharp.IO.fsproj" />
<ProjectReference Include="..\..\src\BioFSharp.Stats\BioFSharp.Stats.fsproj" />
</ItemGroup>
</Project>

0 comments on commit 32d20c3

Please sign in to comment.