Skip to content

Commit

Permalink
reduce protocol splitting complexity of assay.xlsx reader
Browse files Browse the repository at this point in the history
  • Loading branch information
HLWeil committed Sep 2, 2021
1 parent 700bf6d commit 81a8130
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 209 deletions.
7 changes: 6 additions & 1 deletion build.fsx
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,12 @@ module TestTasks =
Shell.cleanDirs (!! "tests/**/**/TestResult")
}

let runTests = BuildTask.create "RunTests" [clean; cleanTestResults; build; copyBinaries] {
/// Build task "cleanTestBinaries": runs Shell.cleanDirs on every directory matching the glob `tests/**/bin`.
/// It is created with an empty task list as its second argument and is itself listed by the "RunTests" task.
// NOTE(review): presumably this removes stale test binaries before a test run - confirm against the build pipeline.
let cleanTestBinaries =
BuildTask.create "cleanTestBinaries" [] {
Shell.cleanDirs (!! "tests/**/bin")
}

let runTests = BuildTask.create "RunTests" [clean; cleanTestBinaries; cleanTestResults; build; copyBinaries] {
let standardParams = Fake.DotNet.MSBuild.CliArguments.Create ()
Fake.DotNet.DotNet.test(fun testParams ->
{
Expand Down
68 changes: 0 additions & 68 deletions src/ISADotNet.XLSX/AssayFile/AnnotationTable.fs
Original file line number Diff line number Diff line change
Expand Up @@ -5,75 +5,7 @@ open ISADotNet

/// Functions for parsing an annotation table to the described processes
module AnnotationTable =

/// Splits the headers of an annotation table into parts, so that each part has at most one input and one output column (Source Name, Sample Name).
///
/// - exactly one source and one sample column: a single part ordered source, remaining headers, sample
/// - no source and one sample column: a single part with the sample column moved to the end
/// - no source and two sample columns sitting at the first and last position: the headers are returned unchanged as a single part
/// - anything else: the headers are grouped with Seq.groupWhen, using "is a sample or source column" as the predicate
let splitBySamples (headers : seq<string>) =
    let isSample header = AnnotationColumn.tryParseSampleName header |> Option.isSome
    let isSource header = AnnotationColumn.tryParseSourceName header |> Option.isSome
    let isInputOrOutput header = isSample header || isSource header

    // Lazy views on the headers; they are re-evaluated whenever they are consumed
    let sources = headers |> Seq.filter isSource
    let samples = headers |> Seq.filter isSample

    match Seq.length sources, Seq.length samples with
    | 1,1 ->
        let others = headers |> Seq.filter (isInputOrOutput >> not)
        Seq.concat [sources; others; samples]
        |> Seq.singleton
    | 0,1 ->
        let others = headers |> Seq.filter (isSample >> not)
        Seq.append others samples
        |> Seq.singleton
    | 0,2 when isSample (Seq.head headers) && isSample (Seq.last headers) ->
        Seq.singleton headers
    | _ ->
        headers |> Seq.groupWhen true isInputOrOutput

/// Splits the headers into protocols according to the given named protocols and assigns the input and output columns to each resulting protocol.
///
/// Every named protocol whose headers are all still unassigned claims those headers; headers that no named protocol claimed are collected under Protocol.empty.
/// The headers of each protocol are brought back into the column order of the table, and the sample/source columns are attached to every protocol.
let splitByNamedProtocols (namedProtocols : (Protocol * seq<string>) seq) (headers : seq<string>) =
    // Position of each header in the table, used to restore the column order after the set operations
    let headerIndex = headers |> Seq.mapi (fun i x -> x,i) |> Map.ofSeq
    let sortAgainst (hs : seq<string>) = Seq.sortBy (fun h -> headerIndex.[h]) hs
    let isInputOrOutput (header:string) = AnnotationColumn.isSample header || AnnotationColumn.isSource header

    let sampleColumns = headers |> Seq.filter isInputOrOutput |> Seq.toList
    let otherColumns = headers |> Seq.filter (isInputOrOutput >> not)

    let namedHeaderSets =
        namedProtocols
        |> Seq.map (fun (p,hs) -> p, Set.ofSeq hs)
        |> List.ofSeq

    // One step of the assignment: once nothing is left to assign the state no longer changes,
    // otherwise a protocol claims its headers only if all of them are still unassigned
    let claim (assigned, remaining) (p, hs) =
        if Set.isEmpty remaining then assigned, remaining
        elif Set.isSubset hs remaining then (p, Set.toSeq hs) :: assigned, Set.difference remaining hs
        else assigned, remaining

    let claimed, leftover = List.fold claim ([], Set.ofSeq otherColumns) namedHeaderSets

    let protocolOverlaps =
        let assigned =
            // Headers not claimed by any named protocol end up under the empty protocol
            if Set.isEmpty leftover then claimed
            else (Protocol.empty, Set.toSeq leftover) :: claimed
        assigned |> Seq.map (fun (p,hs) -> p, sortAgainst hs)

    let withSamples attach = protocolOverlaps |> Seq.map (fun (p,hs) -> p, attach hs)

    match sampleColumns with
    | [] -> protocolOverlaps
    | [s] -> withSamples (fun hs -> Seq.append [s] hs)
    | [s1;s2] -> withSamples (fun hs -> Seq.append (Seq.append [s1] hs) [s2])
    | s -> withSamples (fun hs -> Seq.append hs s)

/// Names unnamed protocols with the given sheetName. If there is more than one unnamed protocol, an index is added to the name ("sheetName_0", "sheetName_1", ...).
///
/// Protocols that already have a name are passed through unchanged, together with their headers.
let indexProtocolsBySheetName (sheetName:string) (protocols : (Protocol * seq<string>) seq) =
    let unnamedProtocolCount = protocols |> Seq.filter (fun (p,_) -> p.Name.IsNone) |> Seq.length
    match unnamedProtocolCount with
    | 0 -> protocols
    | 1 ->
        protocols
        |> Seq.map (fun (p,hs) ->
            if p.Name.IsNone then
                {p with Name = Some sheetName},hs
            else p,hs
        )
    | _ ->
        // The running index is threaded through Seq.mapFold as fold state. A mutable counter captured
        // by a lazy Seq.map would be shared between enumerations, so enumerating the result a second
        // time would continue counting and hand out different names.
        protocols
        |> Seq.mapFold (fun i (p,hs) ->
            if p.Name.IsNone then
                let name = sprintf "%s_%i" sheetName i
                ({p with Name = Some name},hs), i + 1
            else
                (p,hs), i
        ) 0
        |> fst

/// Returns the protocol described by the headers and a function for parsing the values of the matrix to the processes of this protocol
let getProcessGetter protocolMetaData (nodes : seq<seq<string>>) =
Expand Down
44 changes: 11 additions & 33 deletions src/ISADotNet.XLSX/AssayFile/Assay.fs
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,16 @@ module AssayFile =
|> MetaData.init metadataSheetName
|> Spreadsheet.close

let fromSparseMatrix (processNameRoot:string) namedProtocols matrixHeaders (matrixLength:int) (sparseMatrix : Dictionary<string*int,string>) =
AnnotationTable.splitBySamples matrixHeaders
|> Seq.collect (AnnotationTable.splitByNamedProtocols namedProtocols)
|> AnnotationTable.indexProtocolsBySheetName processNameRoot
|> Seq.map (fun (protocolMetaData,headers) ->
let characteristic,factors,protocol,processGetter =
AnnotationNode.splitIntoNodes headers
|> AnnotationTable.getProcessGetter protocolMetaData
characteristic,factors,protocol,
/// Builds the processes described by one annotation table.
///
/// The matrixHeaders are split into nodes (AnnotationNode.splitIntoNodes) and handed to AnnotationTable.getProcessGetter
/// together with an otherwise empty protocol named processNameRoot. The resulting getter is applied to the sparse matrix
/// for the indices 0 .. matrixLength - 1 (presumably the table rows - confirm), after which identical processes are merged
/// and related processes are indexed by protocol name.
///
/// Returns the tuple characteristic, factors, protocol, processes.
let fromSparseMatrix (processNameRoot:string) matrixHeaders (matrixLength:int) (sparseMatrix : Dictionary<string*int,string>) =
let characteristic,factors,protocol,processGetter =
AnnotationNode.splitIntoNodes matrixHeaders
|> AnnotationTable.getProcessGetter ({Protocol.empty with Name = Some processNameRoot})
characteristic,factors,protocol,

Seq.init matrixLength (processGetter sparseMatrix)
|> AnnotationTable.mergeIdenticalProcesses
|> AnnotationTable.indexRelatedProcessesByProtocolName
)
Seq.init matrixLength (processGetter sparseMatrix)
|> AnnotationTable.mergeIdenticalProcesses
|> AnnotationTable.indexRelatedProcessesByProtocolName


/// Parses the assay file
let fromFile (path:string) =
Expand All @@ -53,14 +49,7 @@ module AssayFile =
|> Seq.map (Row.mapCells (Cell.includeSharedStringValue sst.Value))
|> MetaData.fromRows
)
|> Option.defaultValue (None,[])

// Get the named protocol templates from the custom xml
let protocolTemplates =
Spreadsheet.getWorkbookPart doc
|> SwateTable.readSwateTables
|> Seq.choose (fun st -> st.ProtocolGroup |> Option.map (fun ps -> st.Worksheet,ps.Protocols))
|> Map.ofSeq
|> Option.defaultValue (None,[])

let sheetNames =
Spreadsheet.getWorkbookPart doc
Expand All @@ -82,19 +71,8 @@ module AssayFile =
let length =
Table.getArea table
|> fun area -> Table.Area.lowerBoundary area - Table.Area.upperBoundary area |> int
let namedProtocols =
Map.tryFind sheetName protocolTemplates
|> Option.defaultValue Seq.empty
|> Seq.choose (fun p ->
SwateTable.trySelectProtocolheaders p headers
|> Option.map (fun nps ->
Protocol.create None (Some p.Id) None None None None None None None,
nps
)
)
|> Seq.toList

fromSparseMatrix sheetName namedProtocols headers length m
Seq.singleton (fromSparseMatrix sheetName headers length m)

| None -> Seq.empty
| None -> Seq.empty
Expand Down
Loading

0 comments on commit 81a8130

Please sign in to comment.