Attempt to be robust to not having an alignment in our RunXml file
[htsworkflow.git] / docs / conv_caltech_v0.1_to_htsw.py
1 import shutil
2 import sqlite3
3 import sys
4
5 def main(cmdline=None):
6     if len(cmdline) == 1:
7        dest='/tmp/fctracker.db'
8     else:
9       dest = cmdline[1]
10     shutil.copy(cmdline[0], dest)
11     conn = sqlite3.connect(dest)
12     c = conn.cursor()
13     c.execute('drop table fctracker_elandresult');
14     c.execute('''CREATE TABLE "experiments_clusterstation" (
15       "id" integer NOT NULL PRIMARY KEY,
16       "name" varchar(50) NOT NULL UNIQUE);''')
17     c.execute('''INSERT INTO experiments_clusterstation (name) values ("station");''')
18     c.execute('''CREATE TABLE "experiments_sequencer" (
19       "id" integer NOT NULL PRIMARY KEY,
20       "name" varchar(50) NOT NULL UNIQUE);''')
21     c.execute('''INSERT INTO experiments_sequencer (name) values ("HWI-EAS229");''')
22
23     c.execute('''CREATE TABLE "experiments_flowcell" (
24     "id" integer NOT NULL PRIMARY KEY,
25     "flowcell_id" varchar(20) NOT NULL UNIQUE,
26     "run_date" datetime NOT NULL,
27     "advanced_run" bool NOT NULL,
28     "paired_end" bool NOT NULL,
29     "read_length" integer NOT NULL,
30     "lane_1_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
31     "lane_2_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
32     "lane_3_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
33     "lane_4_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
34     "lane_5_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
35     "lane_6_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
36     "lane_7_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
37     "lane_8_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
38     "lane_1_pM" decimal NOT NULL,
39     "lane_2_pM" decimal NOT NULL,
40     "lane_3_pM" decimal NOT NULL,
41     "lane_4_pM" decimal NOT NULL,
42     "lane_5_pM" decimal NOT NULL,
43     "lane_6_pM" decimal NOT NULL,
44     "lane_7_pM" decimal NOT NULL,
45     "lane_8_pM" decimal NOT NULL,
46     "lane_1_cluster_estimate" integer NULL,
47     "lane_2_cluster_estimate" integer NULL,
48     "lane_3_cluster_estimate" integer NULL,
49     "lane_4_cluster_estimate" integer NULL,
50     "lane_5_cluster_estimate" integer NULL,
51     "lane_6_cluster_estimate" integer NULL,
52     "lane_7_cluster_estimate" integer NULL,
53     "lane_8_cluster_estimate" integer NULL,
54     "cluster_station_id" integer NOT NULL REFERENCES "experiments_clusterstation" ("id"),
55     "sequencer_id" integer NOT NULL REFERENCES "experiments_sequencer" ("id"),
56     "notes" text NOT NULL
57 );''')
58     c.execute('''insert into experiments_flowcell 
59         (id, flowcell_id, run_date, advanced_run, paired_end, read_length,
60          lane_1_library_id, lane_2_library_id, lane_3_library_id,
61          lane_4_library_id, lane_5_library_id, lane_6_library_id,
62          lane_7_library_id, lane_8_library_id, lane_1_pm,
63          lane_2_pM, lane_3_pM, lane_4_pM, lane_5_pM, lane_6_pM,
64          lane_7_pM, lane_8_pM, lane_1_cluster_estimate,
65          lane_2_cluster_estimate, lane_3_cluster_estimate, 
66          lane_4_cluster_estimate, lane_5_cluster_estimate,
67          lane_6_cluster_estimate, lane_7_cluster_estimate, 
68          lane_8_cluster_estimate, cluster_station_id, sequencer_id,
69          notes) 
70       select
71          id, flowcell_id, run_date, advanced_run, paired_end, read_length,
72          lane_1_library_id, lane_2_library_id, lane_3_library_id,
73          lane_4_library_id, lane_5_library_id, lane_6_library_id,
74          lane_7_library_id, lane_8_library_id, lane_1_pm,
75          lane_2_pM, lane_3_pM, lane_4_pM, lane_5_pM, lane_6_pM,
76          lane_7_pM, lane_8_pM, lane_1_cluster_estimate,
77          lane_2_cluster_estimate, lane_3_cluster_estimate, 
78          lane_4_cluster_estimate, lane_5_cluster_estimate,
79          lane_6_cluster_estimate, lane_7_cluster_estimate, 
80          lane_8_cluster_estimate, 1, 1,
81          notes from fctracker_flowcell;''')
82     c.execute('''drop table fctracker_flowcell;''')
83
84     # create samples.cellline
85     c.execute('''CREATE TABLE "samples_cellline" (
86     "id" integer NOT NULL PRIMARY KEY,
87     "cellline_name" varchar(100) NOT NULL UNIQUE,
88     "nickname" varchar(20) NULL,
89     "notes" text NOT NULL);''')
90     c.execute('''insert into samples_cellline (cellline_name,notes) values("Unknown","Unknown");''')
91
92     # Create samples.condition
93     c.execute('''CREATE TABLE "samples_condition" (
94     "id" integer NOT NULL PRIMARY KEY,
95     "condition_name" varchar(2000) NOT NULL UNIQUE,
96     "nickname" varchar(20) NULL,
97     "notes" text NOT NULL);''')
98     c.execute('''insert into samples_condition (condition_name,notes) values("Unknown","Unknown");''')
99
100     # create samples.experiment type
101     c.execute('''CREATE TABLE "samples_experimenttype" (
102           "id" integer NOT NULL PRIMARY KEY,
103           "name" varchar(50) NOT NULL UNIQUE);''')
104     for et in [ ('Unknown',),
105                 ('ChIP-seq',),
106                 ('Sheared',),
107                 ('RNA-seq',),
108                 ('Methyl-seq',),
109                 ('DIP-seq',),
110                 ('De Novo',)]:
111         c.execute('insert into samples_experimenttype (name) values (?)', et)
112
113     # create samples.library
114     c.execute('''CREATE TABLE "samples_library" (
115     "id" integer NOT NULL PRIMARY KEY,
116     "library_id" varchar(30) NOT NULL,
117     "library_name" varchar(100) NOT NULL UNIQUE,
118     "library_species_id" integer NOT NULL REFERENCES "samples_species" ("id"),
119     "hidden" bool NOT NULL,
120     "cell_line_id" integer NOT NULL REFERENCES "samples_cellline" ("id"),
121     "condition_id" integer NOT NULL REFERENCES "samples_condition" ("id"),
122     "antibody_id" integer NULL REFERENCES "samples_antibody" ("id"),
123     "replicate" smallint unsigned NOT NULL,
124     "experiment_type_id" NOT NULL REFERENCES "samples_experimenttype" ("id"),
125     "creation_date" date NULL,
126     "made_for" varchar(50) NOT NULL,
127     "made_by" varchar(50) NOT NULL,
128     "stopping_point" varchar(25) NOT NULL,
129     "amplified_from_sample_id" integer NULL,
130     "undiluted_concentration" decimal NULL,
131     "successful_pM" decimal NULL,
132     "ten_nM_dilution" bool NOT NULL,
133     "avg_lib_size" integer NULL,
134     "notes" text NOT NULL);''')
135     c.execute('''INSERT INTO samples_library 
136       (id,library_id,library_name,library_species_id, hidden, experiment_type_id,
137        cell_line_id,condition_id,replicate,made_by,creation_date,
138        made_for,stopping_point,amplified_from_sample_id,
139        undiluted_concentration,ten_nM_dilution,successful_pM,
140        avg_lib_size,notes) 
141 select library_id,library_id,library_name,library_species_id, 0, 1,
142        1,           1,           1,        made_by,creation_date,
143        made_for,stopping_point,amplified_from_sample_id,
144        undiluted_concentration,ten_nM_dilution,successful_pM,
145        225,notes from fctracker_library;''');
146
147     # mark gel isolates as "hidden"
148     c.execute('''update samples_library set hidden=1  
149               where stopping_point = "1A" or stopping_point = "1Ab";''');
150
151     # get pk for RNA-seq experiment type
152     c.execute('select id from samples_experimenttype where name = "RNA-seq";')
153     rna_seq_id = list(c)[0]
154     # change everything marked as rnaseq to experiment_type rnaseq
155     c.execute('''update samples_library set experiment_type_id=?  where library_id in (select library_id from fctracker_library where RNASeq = 1);''', rna_seq_id)
156     #c.execute('''drop table fctracker_library;''') 
157
158     # add affiliation linking table
159     c.execute('''CREATE TABLE "samples_library_affiliations" (
160     "id" integer NOT NULL PRIMARY KEY,
161     "library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
162     "affiliation_id" integer NOT NULL REFERENCES "samples_affiliation" ("id"),
163     UNIQUE ("library_id", "affiliation_id"));''')
164
165     # add library to tags linking table
166     c.execute('''CREATE TABLE "samples_library_tags" (
167     "id" integer NOT NULL PRIMARY KEY,
168     "library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
169     "tag_id" integer NOT NULL REFERENCES "samples_tag" ("id"),
170     UNIQUE ("library_id", "tag_id"));''')
171
172
173
174     #
175     c.execute('''CREATE TABLE "samples_species" (
176     "id" integer NOT NULL PRIMARY KEY,
177     "scientific_name" varchar(256) NOT NULL,
178     "common_name" varchar(256) NOT NULL);''')
179     c.execute('''insert into samples_species 
180         (id, scientific_name, common_name)
181     select
182          id, scientific_name, common_name
183     from fctracker_species;''')
184     c.execute('''drop table fctracker_species''')
185     conn.commit()
186
187 if __name__ == "__main__":
188     main(sys.argv[1:])