Sebastien Mondet (@smondet).
C◦mp◦se :: Conference, Thursday, May 18, 2017.
See also the PDF version, and the PDF handout.
Seb: Software Engineering / Dev Ops at the Hammer Lab.
Was here 2 years ago to present:
Now
Tyxml_js
+ react
WebUIhammerlab/epidisco/
Cool experiment: GADT-based, very high-level pipeline EDSL.
Cool experiment: add tools / tool-kinds:
Kept growing, became the default…
type _ t = | Fastq_gz: File.t -> fastq_gz t | Fastq: File.t -> fastq t | Bam_sample: string * bam -> bam t | Bam_to_fastq: [ `Single | `Paired ] * bam t -> fastq_sample t | Paired_end_sample: fastq_sample_info * fastq t * fastq t -> fastq_sample t | Single_end_sample: fastq_sample_info * fastq t -> fastq_sample t | Gunzip_concat: fastq_gz t list -> fastq t | Concat_text: fastq t list -> fastq t | Star: Star.Configuration.Align.t * fastq_sample t -> bam t | Hisat: Hisat.Configuration.t * fastq_sample t -> bam t | Stringtie: Stringtie.Configuration.t * bam t -> gtf t | Bwa: Bwa.Configuration.Aln.t * fastq_sample t -> bam t | Bwa_mem: Bwa.Configuration.Mem.t * fastq_sample t -> bam t | Mosaik: fastq_sample t -> bam t | Gatk_indel_realigner: Gatk.Configuration.indel_realigner * bam t -> bam t | Picard_mark_duplicates: Picard.Mark_duplicates_settings.t * bam t -> bam t | Gatk_bqsr: (Gatk.Configuration.bqsr * bam t) -> bam t | Bam_pair: bam t * bam t -> bam_pair t | Somatic_variant_caller: somatic Variant_caller.t * bam_pair t -> vcf t | Germline_variant_caller: germline Variant_caller.t * bam t -> vcf t | Seq2HLA: fastq_sample t -> seq2hla_hla_types t | Optitype: ([`DNA | `RNA] * fastq_sample t) -> optitype_hla_types t | With_metadata: metadata_spec * 'a t -> 'a t
let crazy_example ~normal_fastqs ~tumor_fastqs ~dataset = let open Pipeline.Construct in let normal = input_fastq ~dataset normal_fastqs in let tumor = input_fastq ~dataset tumor_fastqs in let bam_pair ?gap_open_penalty ?gap_extension_penalty () = let normal = bwa ?gap_open_penalty ?gap_extension_penalty normal |> gatk_indel_realigner |> picard_mark_duplicates |> gatk_bqsr in let tumor = bwa ?gap_open_penalty ?gap_extension_penalty tumor |> gatk_indel_realigner |> picard_mark_duplicates in pair ~normal ~tumor in let bam_pairs = [ bam_pair (); bam_pair ~gap_open_penalty:10 ~gap_extension_penalty:7 (); ] in let vcfs = List.concat_map bam_pairs ~f:(fun bam_pair -> [ mutect bam_pair; somaticsniper bam_pair; somaticsniper ~prior_probability:0.001 ~theta:0.95 bam_pair; varscan_somatic bam_pair; strelka ~configuration:Strelka.Configuration.exome_default bam_pair; ]) in vcfs
Fancy but not that practical:
Pipeline.t
is getting too bigcompile_aligner_step
is about 170 lines of pattern-matchinglambda
/apply
, list functions, etc.We want what we already have + users of the library to be able to:
Tried a few experiments:
“We trivially and elegantly solved that problem 20 years ago!”
First:
.tar.gz
and draft paper; then
it got to
PEPM'16 →
DOI:2847538.2847542
).bitbucket.org/knih/quel
.@pveber pointed us to Oleg's course:
⇒ Follow the course; with QueΛ's help; in a Biokepi-like setting.
And it's more powerful:
lambda
/apply
, list and pair functions, …And keeps growing:
$ grep 'val ' src/pipeline_edsl/semantics.ml | wc -l
56
Now tutorial mode:
Type Constraints + Existential Types:
type _ t = | Int: int -> int t | True: bool t | False: bool t | Equal: 'a t * 'a t -> bool t let rec eval: type v. v t -> v = function | Int i -> i | True -> true | False -> false | Equal (a, b) -> (=) (eval a) (eval b) let () = assert (eval (Int 42) = 42) let () = assert (eval (Equal (True, (Equal (Int 42, Int 42)))) = true)
type pack = Deal_with_it: 'a * 'a how_to -> pack
hammerlab/genspio
/src/proto/alpha/script_typed_ir.ml
Printf.printf
.Eliom_service.create
.Type Constraints + Existential Types, using module types and functors:
module type Symantics = sig type 'a repr val int: int -> int repr val t: bool repr val f: bool repr val equal: 'a repr -> 'a repr -> bool repr end module Eval_ocaml : Symantics with type 'a repr = 'a = struct type 'a repr = 'a let int i = i let t = true let f = false let equal a b = (a = b) (* Cheating a bit *) end module Examples (EDSL: Symantics) = struct let ex1 = EDSL.int 42 let ex2 = EDSL.(equal t (equal (int 42) (int 42))) end let () = let module Compiled_examples = Examples(Eval_ocaml) in assert (Compiled_examples.ex1 = 42); assert (Compiled_examples.ex2 = true); ()
In OCaml:
module type Semantics
functor: Semantics -> whatever
module implementing Semantics
functor: Semantics -> Semantics
More jargon: “observations” are useful artifacts of optimization passes:
module type Symantics = sig type 'a repr val int: int -> int repr val t: bool repr val f: bool repr val equal: 'a repr -> 'a repr -> bool repr type 'a observation val observe: (unit -> 'a repr) -> 'a observation end
module Eval_string : Symantics with type 'a repr = string and type 'a observation = string = struct type 'a repr = string let int = string_of_int let t = "True" let f = "False" let equal a b = Printf.sprintf "(%s = %s)" a b type 'a observation = string let observe f = f () end module More_examples (EDSL: Symantics) = struct let ex1 = let open EDSL in observe (fun () -> int 42) let ex2 = let open EDSL in observe (fun () -> equal (equal t t) (equal (int 42) (int 43)) ) end let () = let module Compiled_examples = More_examples(Eval_string) in Printf.printf "Ex1: %s\nEx2: %s\n%!" Compiled_examples.ex1 Compiled_examples.ex2; ()
Ex1: 42
Ex2: ((True = True) = (42 = 43))
We can do some rewriting with functors:
module True_equal_true_true (Input: Symantics) : Symantics with type 'a observation = 'a Input.observation = struct include Input let t = Input.(equal t t) end let () = let module Compiled_examples = More_examples(True_equal_true_true(Eval_string)) in Printf.printf "Ex1: %s\nEx2: %s\n%!" Compiled_examples.ex1 Compiled_examples.ex2; ()
Ex1: 42
Ex2: (((True = True) = (True = True)) = (42 = 43))
(this works without the 'a observation
thing …)
For more complex/interesting transformations,
what we really want is to “match term with
”:
type _ t = | Int: int -> int t | True: bool t | False: bool t | Equal: 'a t * 'a t -> bool t let rec transform_equal_true_true : type v. v t -> v t = function | Int i -> Int i | True -> True | False -> False | Equal (True, True) -> True (* Optimization Pass ! *) | Equal (a, b) -> Equal (transform_equal_true_true a, transform_equal_true_true b) let () = assert ( transform_equal_true_true (Equal (False, (Equal (True, True)))) = (Equal (False, True)) )
Some type-hackery later … A Generic Extensible Optimization Pass Generator.
module type Transformation_base = sig type 'a from type 'a term val fwd : 'a from -> 'a term (* reflection *) val bwd : 'a term -> 'a from (* reification *) end module Generic_optimizer (X: Transformation_base) (Input: Symantics with type 'a repr = 'a X.from) : Symantics with type 'a repr = 'a X.term and type 'a observation = 'a Input.observation = struct open X type 'a repr = 'a term let int i = fwd (Input.int i) let t = fwd Input.t let f = fwd Input.f let equal a b = fwd (Input.equal (bwd a) (bwd b)) type 'a observation = 'a Input.observation (* Here we “get out” ! *) let observe f = Input.observe (fun () -> bwd (f ())) end
So we want to do | Equal (True, True) -> True
:
module True_true (Input: Symantics) = struct module Transformation = struct type 'a from = 'a Input.repr type 'a term = | Unknown: 'a from -> 'a term | Equal: 'a term * 'a term -> bool term | True: bool term let fwd x = Unknown x let rec bwd : type a. a term -> a from = function | Unknown x -> x | Equal (True, True) -> Input.t | Equal (a, b) -> Input.equal (bwd a) (bwd b) | True -> Input.t end module Language_delta = struct let equal a b = Transformation.Equal (a, b) let t = Transformation.True end include Generic_optimizer(Transformation)(Input) include Language_delta end
Still just a functor to apply “in the chain:”
let () = let module Compiled = More_examples(Eval_string) in let module Optimized = More_examples(True_true(Eval_string)) in Printf.printf "Compiled: %s\nOptimized: %s\n%!" Compiled.ex2 Optimized.ex2
Success!
Compiled: ((True = True) = (42 = 43))
Optimized: (True = (42 = 43))
Some include
, and module sub-typing magic:
module type Symantics_with_lambdas = sig include Symantics (** Lambda abstraction *) val lambda : ('a repr -> 'b repr) -> ('a -> 'b) repr (** Application *) val apply : ('a -> 'b) repr -> 'a repr -> 'b repr end module Eval_string_with_lambdas : Symantics_with_lambdas with type 'a repr = string and type 'a observation = string = struct include Eval_string open Printf let lambda f = let var = sprintf "x%d" (Random.int 1000) in sprintf "(λ %s → %s)" var (f var) let apply f x = sprintf "(%s %s)" f x end
module Example_with_lambdas (EDSL : Symantics_with_lambdas) = struct open EDSL let l1 = lambda (fun x -> equal x t) let ex1 = observe (fun () -> l1) let ex2 = observe (fun () -> apply l1 (equal t t)) (* Of course still type checked: let ex2 = observe (fun () -> apply l1 (int 42)) Error: This expression has type int repr but an expression was expected of type bool repr Type int is not compatible with type bool *) end let () = let module Compiled = Example_with_lambdas(Eval_string_with_lambdas) in Printf.printf "Ex1: %s\nEx2: %s\n%!" Compiled.ex1 Compiled.ex2
Ex1: (λ x370 → (x370 = True))
Ex2: ((λ x370 → (x370 = True)) (True = True))
Soooo meta:
module Generic_optimizer_with_lambdas (X: Transformation_base) (Input: Symantics_with_lambdas with type 'a repr = 'a X.from) : Symantics_with_lambdas with type 'a repr = 'a X.term and type 'a observation = 'a Input.observation = struct open X include Generic_optimizer(X)(Input) let lambda f = fwd (Input.lambda (fun x -> bwd (f (fwd x)))) let apply e1 e2 = fwd (Input.apply (bwd e1) (bwd e2)) end
True_true
does not touch the new stuff:
module True_true_with_lambdas (Input: Symantics_with_lambdas) = struct module Previous_true_true = True_true(Input) include Generic_optimizer_with_lambdas(Previous_true_true.Transformation)(Input) include Previous_true_true.Language_delta end let () = let module Compiled = Example_with_lambdas(Eval_string_with_lambdas) in let module Optimized = Example_with_lambdas(True_true_with_lambdas(Eval_string_with_lambdas)) in Printf.printf "Ex2 normal: %s\nEx2 optimized: %s\n%!" Compiled.ex2 Optimized.ex2
Ex2 normal: ((λ x20 → (x20 = True)) (True = True))
Ex2 optimized: ((λ x921 → (x921 = True)) True)
Fully replaced the GADT-based EDSL:
From PR #236
:
Big (family of) pipeline(s) that drive a clinical trial and other people's
analyses:
hammerlab/epidisco/
Cf. output to dot
-graphs:
We actually do extend the EDSL:
Minor issues:
#429
.include
.list_map
, pair_first
, pair_second
, …Questions?