Add my script to convert Caltech's made_for field to htsworkflow 0.2's affiliation...
[htsworkflow.git] / docs / conv_caltech_v0.1_to_htsw.py
1 import shutil
2 import sqlite3
3 import sys
4
5 def main(cmdline=None):
6     if len(cmdline) == 1:
7        dest='/tmp/fctracker.db'
8     else:
9       dest = cmdline[1]
10     shutil.copy(cmdline[0], dest)
11     conn = sqlite3.connect(dest)
12     c = conn.cursor()
13     c.execute('drop table fctracker_elandresult');
14     c.execute('''CREATE TABLE "experiments_flowcell" (
15     "id" integer NOT NULL PRIMARY KEY,
16     "flowcell_id" varchar(20) NOT NULL UNIQUE,
17     "run_date" datetime NOT NULL,
18     "advanced_run" bool NOT NULL,
19     "paired_end" bool NOT NULL,
20     "read_length" integer NOT NULL,
21     "lane_1_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
22     "lane_2_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
23     "lane_3_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
24     "lane_4_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
25     "lane_5_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
26     "lane_6_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
27     "lane_7_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
28     "lane_8_library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
29     "lane_1_pM" decimal NOT NULL,
30     "lane_2_pM" decimal NOT NULL,
31     "lane_3_pM" decimal NOT NULL,
32     "lane_4_pM" decimal NOT NULL,
33     "lane_5_pM" decimal NOT NULL,
34     "lane_6_pM" decimal NOT NULL,
35     "lane_7_pM" decimal NOT NULL,
36     "lane_8_pM" decimal NOT NULL,
37     "lane_1_cluster_estimate" integer NULL,
38     "lane_2_cluster_estimate" integer NULL,
39     "lane_3_cluster_estimate" integer NULL,
40     "lane_4_cluster_estimate" integer NULL,
41     "lane_5_cluster_estimate" integer NULL,
42     "lane_6_cluster_estimate" integer NULL,
43     "lane_7_cluster_estimate" integer NULL,
44     "lane_8_cluster_estimate" integer NULL,
45     "cluster_mac_id" varchar(50) NOT NULL,
46     "seq_mac_id" varchar(50) NOT NULL,
47     "notes" text NOT NULL
48 );''')
49     c.execute('''insert into experiments_flowcell 
50         (id, flowcell_id, run_date, advanced_run, paired_end, read_length,
51          lane_1_library_id, lane_2_library_id, lane_3_library_id,
52          lane_4_library_id, lane_5_library_id, lane_6_library_id,
53          lane_7_library_id, lane_8_library_id, lane_1_pm,
54          lane_2_pM, lane_3_pM, lane_4_pM, lane_5_pM, lane_6_pM,
55          lane_7_pM, lane_8_pM, lane_1_cluster_estimate,
56          lane_2_cluster_estimate, lane_3_cluster_estimate, 
57          lane_4_cluster_estimate, lane_5_cluster_estimate,
58          lane_6_cluster_estimate, lane_7_cluster_estimate, 
59          lane_8_cluster_estimate, cluster_mac_id, seq_mac_id,
60          notes) 
61       select
62          id, flowcell_id, run_date, advanced_run, paired_end, read_length,
63          lane_1_library_id, lane_2_library_id, lane_3_library_id,
64          lane_4_library_id, lane_5_library_id, lane_6_library_id,
65          lane_7_library_id, lane_8_library_id, lane_1_pm,
66          lane_2_pM, lane_3_pM, lane_4_pM, lane_5_pM, lane_6_pM,
67          lane_7_pM, lane_8_pM, lane_1_cluster_estimate,
68          lane_2_cluster_estimate, lane_3_cluster_estimate, 
69          lane_4_cluster_estimate, lane_5_cluster_estimate,
70          lane_6_cluster_estimate, lane_7_cluster_estimate, 
71          lane_8_cluster_estimate, "", "",
72          notes from fctracker_flowcell;''')
73     c.execute('''drop table fctracker_flowcell;''')
74
75     # create samples.cellline
76     c.execute('''CREATE TABLE "samples_cellline" (
77     "id" integer NOT NULL PRIMARY KEY,
78     "cellline_name" varchar(100) NOT NULL UNIQUE,
79     "nickname" varchar(20) NULL,
80     "notes" text NOT NULL);''')
81     c.execute('''insert into samples_cellline (cellline_name,notes) values("Unknown","Unknown");''')
82
83     # Create samples.condition
84     c.execute('''CREATE TABLE "samples_condition" (
85     "id" integer NOT NULL PRIMARY KEY,
86     "condition_name" varchar(2000) NOT NULL UNIQUE,
87     "nickname" varchar(20) NULL,
88     "notes" text NOT NULL);''')
89     c.execute('''insert into samples_condition (condition_name,notes) values("Unknown","Unknown");''')
90
91     # create samples.library
92     c.execute('''CREATE TABLE "samples_library" (
93     "id" integer NOT NULL PRIMARY KEY,
94     "library_id" varchar(30) NOT NULL,
95     "library_name" varchar(100) NOT NULL UNIQUE,
96     "library_species_id" integer NOT NULL REFERENCES "samples_species" ("id"),
97     "cell_line_id" integer NOT NULL REFERENCES "samples_cellline" ("id"),
98     "condition_id" integer NOT NULL REFERENCES "samples_condition" ("id"),
99     "antibody_id" integer NULL REFERENCES "samples_antibody" ("id"),
100     "replicate" smallint unsigned NOT NULL,
101     "experiment_type" varchar(50) NOT NULL,
102     "creation_date" date NULL,
103     "made_for" varchar(50) NOT NULL,
104     "made_by" varchar(50) NOT NULL,
105     "stopping_point" varchar(25) NOT NULL,
106     "amplified_from_sample_id" integer NULL,
107     "undiluted_concentration" decimal NULL,
108     "successful_pM" decimal NULL,
109     "ten_nM_dilution" bool NOT NULL,
110     "avg_lib_size" integer NULL,
111     "notes" text NOT NULL);''')
112     c.execute('''INSERT INTO samples_library 
113       (id,library_id,library_name,library_species_id, experiment_type,
114        cell_line_id,condition_id,replicate,made_by,creation_date,
115        made_for,stopping_point,amplified_from_sample_id,
116        undiluted_concentration,ten_nM_dilution,successful_pM,
117        avg_lib_size,notes) 
118 select library_id,library_id,library_name,library_species_id,"unknown",
119        1,           1,           1,        made_by,creation_date,
120        made_for,stopping_point,amplified_from_sample_id,
121        undiluted_concentration,ten_nM_dilution,successful_pM,
122        0,notes from fctracker_library;''');
123     c.execute('''update samples_library set experiment_type="RNA-seq" where library_id in (select library_id from fctracker_library where RNASeq = 1);''')
124     #c.execute('''drop table fctracker_library;''') 
125     # add many to many tables
126     c.execute('''CREATE TABLE "samples_library_affiliations" (
127     "id" integer NOT NULL PRIMARY KEY,
128     "library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
129     "affiliation_id" integer NOT NULL REFERENCES "samples_affiliation" ("id"),
130     UNIQUE ("library_id", "affiliation_id"));''')
131
132     c.execute('''CREATE TABLE "samples_library_tags" (
133     "id" integer NOT NULL PRIMARY KEY,
134     "library_id" integer NOT NULL REFERENCES "samples_library" ("id"),
135     "tag_id" integer NOT NULL REFERENCES "samples_tag" ("id"),
136     UNIQUE ("library_id", "tag_id"));''')
137
138
139
140     #
141     c.execute('''CREATE TABLE "samples_species" (
142     "id" integer NOT NULL PRIMARY KEY,
143     "scientific_name" varchar(256) NOT NULL,
144     "common_name" varchar(256) NOT NULL);''')
145     c.execute('''insert into samples_species 
146         (id, scientific_name, common_name)
147     select
148          id, scientific_name, common_name
149     from fctracker_species;''')
150     c.execute('''drop table fctracker_species''')
151     conn.commit()
152
153 if __name__ == "__main__":
154     main(sys.argv[1:])