Attempt to be robust to not having an alignment in our RunXml file
[htsworkflow.git] / docs / conv_caltech_v0.1_to_htsw.py
index 10f4e35862d181a19bf9303e573b7932d91d625f..e69d9faaddb8183090b949bdf24a4e8046fbd887 100644 (file)
@@ -11,6 +11,15 @@ def main(cmdline=None):
     conn = sqlite3.connect(dest)
     c = conn.cursor()
     c.execute('drop table fctracker_elandresult');
+    c.execute('''CREATE TABLE "experiments_clusterstation" (
+      "id" integer NOT NULL PRIMARY KEY,
+      "name" varchar(50) NOT NULL UNIQUE);''')
+    c.execute('''INSERT INTO experiments_clusterstation (name) values ("station");''')
+    c.execute('''CREATE TABLE "experiments_sequencer" (
+      "id" integer NOT NULL PRIMARY KEY,
+      "name" varchar(50) NOT NULL UNIQUE);''')
+    c.execute('''INSERT INTO experiments_sequencer (name) values ("HWI-EAS229");''')
+
     c.execute('''CREATE TABLE "experiments_flowcell" (
     "id" integer NOT NULL PRIMARY KEY,
     "flowcell_id" varchar(20) NOT NULL UNIQUE,
@@ -42,8 +51,8 @@ def main(cmdline=None):
     "lane_6_cluster_estimate" integer NULL,
     "lane_7_cluster_estimate" integer NULL,
     "lane_8_cluster_estimate" integer NULL,
-    "cluster_mac_id" varchar(50) NOT NULL,
-    "seq_mac_id" varchar(50) NOT NULL,
+    "cluster_station_id" integer NOT NULL REFERENCES "experiments_clusterstation" ("id"),
+    "sequencer_id" integer NOT NULL REFERENCES "experiments_sequencer" ("id"),
     "notes" text NOT NULL
 );''')
     c.execute('''insert into experiments_flowcell 
@@ -56,7 +65,7 @@ def main(cmdline=None):
          lane_2_cluster_estimate, lane_3_cluster_estimate, 
          lane_4_cluster_estimate, lane_5_cluster_estimate,
          lane_6_cluster_estimate, lane_7_cluster_estimate, 
-         lane_8_cluster_estimate, cluster_mac_id, seq_mac_id,
+         lane_8_cluster_estimate, cluster_station_id, sequencer_id,
          notes) 
       select
          id, flowcell_id, run_date, advanced_run, paired_end, read_length,
@@ -68,7 +77,7 @@ def main(cmdline=None):
          lane_2_cluster_estimate, lane_3_cluster_estimate, 
          lane_4_cluster_estimate, lane_5_cluster_estimate,
          lane_6_cluster_estimate, lane_7_cluster_estimate, 
-         lane_8_cluster_estimate, "", "",
+         lane_8_cluster_estimate, 1, 1,
          notes from fctracker_flowcell;''')
     c.execute('''drop table fctracker_flowcell;''')
 
@@ -88,17 +97,31 @@ def main(cmdline=None):
     "notes" text NOT NULL);''')
     c.execute('''insert into samples_condition (condition_name,notes) values("Unknown","Unknown");''')
 
+    # create samples.experiment type
+    c.execute('''CREATE TABLE "samples_experimenttype" (
+          "id" integer NOT NULL PRIMARY KEY,
+          "name" varchar(50) NOT NULL UNIQUE);''')
+    for et in [ ('Unknown',),
+                ('ChIP-seq',),
+                ('Sheared',),
+                ('RNA-seq',),
+                ('Methyl-seq',),
+                ('DIP-seq',),
+                ('De Novo',)]:
+        c.execute('insert into samples_experimenttype (name) values (?)', et)
+
     # create samples.library
     c.execute('''CREATE TABLE "samples_library" (
     "id" integer NOT NULL PRIMARY KEY,
     "library_id" varchar(30) NOT NULL,
     "library_name" varchar(100) NOT NULL UNIQUE,
     "library_species_id" integer NOT NULL REFERENCES "samples_species" ("id"),
+    "hidden" bool NOT NULL,
     "cell_line_id" integer NOT NULL REFERENCES "samples_cellline" ("id"),
     "condition_id" integer NOT NULL REFERENCES "samples_condition" ("id"),
     "antibody_id" integer NULL REFERENCES "samples_antibody" ("id"),
     "replicate" smallint unsigned NOT NULL,
-    "experiment_type" varchar(50) NOT NULL,
+    "experiment_type_id" NOT NULL REFERENCES "samples_experimenttype" ("id"),
     "creation_date" date NULL,
     "made_for" varchar(50) NOT NULL,
     "made_by" varchar(50) NOT NULL,
@@ -110,25 +133,36 @@ def main(cmdline=None):
     "avg_lib_size" integer NULL,
     "notes" text NOT NULL);''')
     c.execute('''INSERT INTO samples_library 
-      (id,library_id,library_name,library_species_id, experiment_type,
+      (id,library_id,library_name,library_species_id, hidden, experiment_type_id,
        cell_line_id,condition_id,replicate,made_by,creation_date,
        made_for,stopping_point,amplified_from_sample_id,
        undiluted_concentration,ten_nM_dilution,successful_pM,
        avg_lib_size,notes) 
-select library_id,library_id,library_name,library_species_id,"unknown",
+select library_id,library_id,library_name,library_species_id, 0, 1,
        1,           1,           1,        made_by,creation_date,
        made_for,stopping_point,amplified_from_sample_id,
        undiluted_concentration,ten_nM_dilution,successful_pM,
-       0,notes from fctracker_library;''');
-    c.execute('''update samples_library set experiment_type="RNA-seq" where library_id in (select library_id from fctracker_library where RNASeq = 1);''')
+       225,notes from fctracker_library;''');
+
+    # mark gel isolates as "hidden"
+    c.execute('''update samples_library set hidden=1  
+              where stopping_point = "1A" or stopping_point = "1Ab";''');
+
+    # get pk for RNA-seq experiment type
+    c.execute('select id from samples_experimenttype where name = "RNA-seq";')
+    rna_seq_id = list(c)[0]
+    # change everything marked as rnaseq to experiment_type rnaseq
+    c.execute('''update samples_library set experiment_type_id=?  where library_id in (select library_id from fctracker_library where RNASeq = 1);''', rna_seq_id)
     #c.execute('''drop table fctracker_library;''') 
-    # add many to many tables
+
+    # add affiliation linking table
     c.execute('''CREATE TABLE "samples_library_affiliations" (
     "id" integer NOT NULL PRIMARY KEY,
     "library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
     "affiliation_id" integer NOT NULL REFERENCES "samples_affiliation" ("id"),
     UNIQUE ("library_id", "affiliation_id"));''')
 
+    # add library to tags linking table
     c.execute('''CREATE TABLE "samples_library_tags" (
     "id" integer NOT NULL PRIMARY KEY,
     "library_id" integer NOT NULL REFERENCES "samples_library" ("id"),