5 Copyright (c) 2008 Genome Research Ltd (GRL).
7 Permission is hereby granted, free of charge, to any person obtaining
8 a copy of this software and associated documentation files (the
9 "Software"), to deal in the Software without restriction, including
10 without limitation the rights to use, copy, modify, merge, publish,
11 distribute, sublicense, and/or sell copies of the Software, and to
12 permit persons to whom the Software is furnished to do so, subject to
13 the following conditions:
15 The above copyright notice and this permission notice shall be
16 included in all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
22 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
23 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 /* Contact: Heng Li <lh3@sanger.ac.uk> */
30 /* I will probably not do any socket programming in the next few years,
31 so I have decided to annotate this file heavily, for Linux and
32 Windows alike. -lh3 */
41 #include <sys/types.h>
47 #include <arpa/inet.h>
48 #include <sys/socket.h>
53 /* In winsock.h, the type of a socket is SOCKET, which is: "typedef
54 * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
55 * integer -1. In knetfile.c, I use "int" for socket type
56 * throughout. This should be improved to avoid confusion.
58 * In Linux/Mac, recv() and read() do almost the same thing. You can see
59 * in the header file that netread() is simply an alias of read(). In
60 * Windows, however, they are different and using recv() is mandatory.
63 /* This function tests if the file descriptor is ready for reading (or
64 * writing if is_read==0). */
/* Wait up to 5 seconds for `fd` to become ready, using select().
 *
 * fd:      descriptor to poll; select() is called with nfds = fd+1.
 * is_read: non-zero puts the descriptor set in the read slot (fdr);
 *          presumably the write slot (fdw) is used otherwise -- that
 *          branch is not visible in this excerpt.
 *
 * Failures are reported with perror("select") or on pysamerr (a time-out
 * message, and a SOCKET_ERROR branch that prints WSAGetLastError()).
 * The return statement is not visible here; callers in this file treat a
 * result <= 0 as "not ready". */
65 static int socket_wait(int fd, int is_read)
67 fd_set fds, *fdr = 0, *fdw = 0;
70 tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
73 if (is_read) fdr = &fds;
75 ret = select(fd+1, fdr, fdw, 0, &tv);
77 if (ret == -1) perror("select");
80 fprintf(pysamerr, "select time-out\n");
81 else if (ret == SOCKET_ERROR)
82 fprintf(pysamerr, "select: %d\n", WSAGetLastError());
88 /* This function does not work with Windows due to the lack of
89 * getaddrinfo() in winsock. It is adapted from an example in "Beej's
90 * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
/* Open a stream (TCP) connection to host:port.
 *
 * host, port: passed unchanged to getaddrinfo() with AF_UNSPEC and
 *             SOCK_STREAM hints.
 *
 * Only the first addrinfo entry (res) is used for socket() and connect().
 * SO_REUSEADDR and SO_LINGER (initialised to {0, 0}) are set before
 * connecting.  Every failing step goes through __err_connect, which calls
 * perror(), freeaddrinfo(res) and returns -1.  The success return is not
 * visible in this excerpt; callers compare the result with -1.
 *
 * NOTE(review): `res` has no initialiser, so __err_connect calls
 * freeaddrinfo() on an unset pointer when getaddrinfo() itself fails; the
 * macro also does not close `fd` when a later step fails -- confirm. */
91 static int socket_connect(const char *host, const char *port)
93 #define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
96 struct linger lng = { 0, 0 };
97 struct addrinfo hints, *res;
98 memset(&hints, 0, sizeof(struct addrinfo));
99 hints.ai_family = AF_UNSPEC;
100 hints.ai_socktype = SOCK_STREAM;
101 /* In Unix/Mac, getaddrinfo() is the most convenient way to get
102 * server information. */
103 if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
104 if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
105 /* The following two setsockopt() are used by ftplib
106 * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
108 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
109 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
110 if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
115 /* MinGW's printf has a problem with "%lld" */
/* Write the decimal digits of x into buf.
 *
 * Digits are produced least-significant first ('0' + x % 10) and the first
 * cnt characters are then reversed in place by swapping from both ends.
 * The return statement is not visible in this excerpt (presumably buf).
 *
 * NOTE(review): for negative x, x % 10 is not positive, so the visible
 * digit expression would not yield '0'..'9'; it looks like only x >= 0 is
 * supported -- confirm.  The caller must supply a large enough buf. */
116 char *int64tostr(char *buf, int64_t x)
121 buf[i++] = '0' + x % 10;
125 for (cnt = i, i = 0; i < cnt/2; ++i) {
126 int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
/* Convert a string of decimal digits to int64_t.
 *
 * Every character up to the terminating NUL is folded in as
 * x = x * 10 + (c - 48).  The visible code has no check that c is a digit,
 * no sign handling and no overflow detection, so the caller must pass a
 * plain digit string. */
131 int64_t strtoint64(const char *buf)
134 for (x = 0; *buf != '\0'; ++buf)
135 x = x * 10 + ((int64_t) *buf - 48);
138 /* In windows, the first thing is to establish the TCP connection. */
/* Start Winsock, requesting version 2.2, and return the result of
 * WSAStartup() unchanged. */
139 int knet_win32_init()
142 return WSAStartup(MAKEWORD(2, 2), &wsaData);
/* NOTE(review): the body is not visible in this excerpt; presumably the
 * tear-down counterpart of knet_win32_init() -- confirm. */
144 void knet_win32_destroy()
148 /* A slightly modified version of the following function also works on
149 * Mac (and presumably Linux). However, this function is not stable on
150 * my Mac. It sometimes works fine but sometimes does not. Therefore for
151 * non-Windows OS, I do not use this one. */
/* Winsock variant: open an IPv4 TCP connection to host:port.
 *
 * host: looked up with gethostbyname() when its first character is
 *       alphabetic; the following lines convert it with inet_addr() and
 *       look it up with gethostbyaddr() (the branch header is not visible
 *       in this excerpt).
 * port: numeric string, converted with atoi() and htons().
 *
 * SO_REUSEADDR and SO_LINGER ({0, 0}) are set on the socket before the
 * lookup.  __err_connect logs "<func>: <WSAGetLastError()>" to pysamerr;
 * the rest of that macro and the success return are not visible here.
 *
 * NOTE(review): a host name that starts with a digit takes the
 * inet_addr() path, and h_addr is read through an unsigned long*, which
 * assumes that type is as wide as s_addr -- confirm. */
152 static SOCKET socket_connect(const char *host, const char *port)
154 #define __err_connect(func) \
156 fprintf(pysamerr, "%s: %d\n", func, WSAGetLastError()); \
162 struct linger lng = { 0, 0 };
163 struct sockaddr_in server;
164 struct hostent *hp = 0;
166 if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
167 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
168 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
170 if (isalpha(host[0])) hp = gethostbyname(host);
173 addr.s_addr = inet_addr(host);
174 hp = gethostbyaddr((char*)&addr, 4, AF_INET);
176 if (hp == 0) __err_connect("gethost");
178 server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
179 server.sin_family= AF_INET;
180 server.sin_port = htons(atoi(port));
181 if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
182 // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
/* Read up to len bytes from socket fd into buf, calling netread()
 * repeatedly because one call may deliver less than requested.
 *
 * Before each netread() the socket is polled with socket_wait(); reading
 * stops when the socket is not ready (result <= 0) or netread() returns 0
 * (EOF).  `l` counts the bytes stored so far and `rest` the bytes still
 * wanted.  The loop header and the return statement are not visible in
 * this excerpt (presumably l is returned).
 *
 * NOTE(review): in the visible lines a negative netread() result is not
 * told apart from data and would be added to l -- confirm.  `buf + l` does
 * arithmetic on a void pointer, which relies on a compiler extension. */
187 static off_t my_netread(int fd, void *buf, off_t len)
189 off_t rest = len, curr, l = 0;
190 /* recv() and read() may not read the required length of data with
191 * one call. They have to be called repeatedly. */
193 if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
194 curr = netread(fd, buf + l, rest);
195 /* According to the glibc manual, section 13.2, a zero returned
196 * value indicates end-of-file (EOF), which should mean that
197 * read() will not return zero if EOF has not been met but data
198 * are not immediately available. */
199 if (curr == 0) break;
200 l += curr; rest -= curr;
205 /*************************
206 * FTP specific routines *
207 *************************/
/* Read a reply from the FTP control socket into ftp->response and return
 * its leading number.
 *
 * Returns 0 when the control socket is not readable (socket_wait() <= 0)
 * and -1 when fewer than 2 bytes were collected; otherwise the result of
 * strtol() on the stored text.
 *
 * Bytes are read one at a time.  ftp->response is grown by doubling
 * max_response (starting at 256) whenever n reaches it.  The visible break
 * test requires at least 4 stored bytes, three leading digits and a fourth
 * byte other than '-'; the condition wrapped around that test is not
 * visible in this excerpt.  The last two collected bytes are cut off by
 * storing a NUL at n-2.
 *
 * NOTE(review): the realloc() result is assigned straight back to
 * ftp->response without a NULL check, and strtol() is called with base 0,
 * so text starting with '0' would be read as octal -- confirm. */
209 static int kftp_get_response(knetFile *ftp)
218 if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
219 while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
220 //fputc(c, pysamerr);
221 if (n >= ftp->max_response) {
222 ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
223 ftp->response = realloc(ftp->response, ftp->max_response);
225 ftp->response[n++] = c;
227 if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
228 && ftp->response[3] != '-') break;
233 if (n < 2) return -1;
234 ftp->response[n-2] = 0;
235 return strtol(ftp->response, &p, 0);
/* Send the command string cmd on the FTP control socket.
 *
 * Returns -1 when the socket is not writable (socket_wait() <= 0).
 * Otherwise strlen(cmd) bytes are written with netwrite() and the result
 * is kftp_get_response() when is_get is non-zero, else 0.
 *
 * NOTE(review): the netwrite() result is ignored, so a short or failed
 * write is not reported to the caller. */
238 static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
240 if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
241 netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
242 return is_get? kftp_get_response(ftp) : 0;
/* Send PASV and record the address found in the reply.
 *
 * Scans ftp->response for '(' and returns -1 if there is none.  Six
 * comma-separated integers are then parsed into v[]; the first four are
 * copied into ftp->pasv_ip and ftp->pasv_port is built as
 * (v[4] << 8 & 0xff00) + v[5].  The success return is not visible in this
 * excerpt.
 *
 * NOTE(review): neither the kftp_send_cmd() result nor the sscanf() match
 * count is checked in the visible lines -- confirm. */
245 static int kftp_pasv_prep(knetFile *ftp)
249 kftp_send_cmd(ftp, "PASV\r\n", 1);
250 for (p = ftp->response; *p && *p != '('; ++p);
251 if (*p != '(') return -1;
253 sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
254 memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
255 ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
/* Connect ftp->fd to the address held in ftp->pasv_ip / ftp->pasv_port.
 *
 * When pasv_port is still 0 a message is written to pysamerr (the action
 * taken afterwards is not visible in this excerpt).  Otherwise pasv_ip is
 * formatted as a dotted quad and pasv_port as decimal text, the pair is
 * handed to socket_connect(), and the resulting descriptor is stored in
 * ftp->fd.  Returns -1 when socket_connect() yields -1. */
260 static int kftp_pasv_connect(knetFile *ftp)
262 char host[80], port[10];
263 if (ftp->pasv_port == 0) {
264 fprintf(pysamerr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
267 sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
268 sprintf(port, "%d", ftp->pasv_port);
269 ftp->fd = socket_connect(host, port);
270 if (ftp->fd == -1) return -1;
/* Open the FTP control connection to ftp->host:ftp->port and log in.
 *
 * Returns -1 when socket_connect() fails.  Otherwise one response is read,
 * then "USER anonymous", "PASS kftp@" and "TYPE I" are sent, each waiting
 * for a reply.  The success return is not visible in this excerpt.
 *
 * NOTE(review): the replies to the three commands are not checked. */
274 int kftp_connect(knetFile *ftp)
276 ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
277 if (ftp->ctrl_fd == -1) return -1;
278 kftp_get_response(ftp);
279 kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
280 kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
281 kftp_send_cmd(ftp, "TYPE I\r\n", 1);
/* Close the control connection if one is open (ctrl_fd != -1) and return
 * the result of a fresh kftp_connect().  Some statements between the close
 * and the reconnect are not visible in this excerpt. */
285 int kftp_reconnect(knetFile *ftp)
287 if (ftp->ctrl_fd != -1) {
288 netclose(ftp->ctrl_fd);
293 return kftp_connect(ftp);
296 // initialize ->type, ->host, ->retr and ->size
/* Build a knetFile from a URL of the form ftp://host/path; no connection
 * is made here.
 *
 * fn:   the URL.  Returns 0 unless it starts with "ftp://" and has a '/'
 *       after the host part.
 * mode: a 'c' anywhere in it sets fp->no_reconnect.
 *
 * The calloc()'d handle gets type KNF_TYPE_FTP, port "21", host copied
 * from fn+6 (l bytes; l is set on a line not visible in this excerpt), and
 * two prepared commands, "RETR <path>\r\n" and "SIZE <path>\r\n".  Each
 * command buffer is strlen(path) + 8 bytes, which covers the 5-byte verb,
 * the CRLF and the NUL.
 *
 * NOTE(review): calloc()/strdup() results are used without NULL checks. */
297 knetFile *kftp_parse_url(const char *fn, const char *mode)
302 if (strstr(fn, "ftp://") != fn) return 0;
303 for (p = (char*)fn + 6; *p && *p != '/'; ++p);
304 if (*p != '/') return 0;
306 fp = calloc(1, sizeof(knetFile));
307 fp->type = KNF_TYPE_FTP;
309 /* the Linux/Mac version of socket_connect() also recognizes a port
310 * like "ftp", but the Windows version does not. */
311 fp->port = strdup("21");
312 fp->host = calloc(l + 1, 1);
313 if (strchr(mode, 'c')) fp->no_reconnect = 1;
314 strncpy(fp->host, fn + 6, l);
315 fp->retr = calloc(strlen(p) + 8, 1);
316 sprintf(fp->retr, "RETR %s\r\n", p);
317 fp->size_cmd = calloc(strlen(p) + 8, 1);
318 sprintf(fp->size_cmd, "SIZE %s\r\n", p);
322 // place ->fd at offset off
/* Set up the FTP data connection for fp at fp->offset.
 *
 * Visible sequence: read one response first when no_reconnect is set;
 * send fp->size_cmd and store the parsed size in fp->file_size; send
 * "REST <offset>"; send fp->retr without waiting for a reply; call
 * kftp_pasv_connect(); then read the reply into ret and print
 * fp->response on pysamerr (the condition for that print is not visible).
 *
 * Two alternatives are visible for both parsing and formatting 64-bit
 * values: sscanf()/sprintf() with "%lld", or a manual scan to the first
 * digit after the first space plus strtoint64(), and int64tostr().  The
 * lines that select between them are not visible in this excerpt.
 *
 * NOTE(review): the results of kftp_send_cmd() and kftp_pasv_connect()
 * are ignored in the visible lines -- confirm. */
323 int kftp_connect_file(knetFile *fp)
329 if (fp->no_reconnect) kftp_get_response(fp);
332 kftp_send_cmd(fp, fp->size_cmd, 1);
334 if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
336 fprintf(pysamerr,"[kftp_connect_file] %s\n", fp->response);
340 const char *p = fp->response;
341 while (*p != ' ') ++p;
342 while (*p < '0' || *p > '9') ++p;
343 file_size = strtoint64(p);
345 fp->file_size = file_size;
349 sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
351 strcpy(tmp, "REST ");
352 int64tostr(tmp + 5, fp->offset);
355 kftp_send_cmd(fp, tmp, 1);
357 kftp_send_cmd(fp, fp->retr, 0);
358 kftp_pasv_connect(fp);
359 ret = kftp_get_response(fp);
361 fprintf(pysamerr, "[kftp_connect_file] %s\n", fp->response);
371 /**************************
372 * HTTP specific routines *
373 **************************/
/* Build a knetFile from a URL of the form http://host[:port]/path; no
 * connection is made here.
 *
 * Returns 0 unless fn starts with "http://".  fp->http_host receives the
 * text between "http://" and the first '/', and a ':' in it is overwritten
 * with NUL so that q points at the port text.
 *
 * Without a proxy: host is a copy of http_host, port is the text after
 * ':' or "80", and path is the rest of the URL or "/".
 * With the http_proxy environment variable: host and port are taken from
 * the proxy string (a leading "http://" is skipped) and path is the whole
 * URL fn.
 *
 * type is set to KNF_TYPE_HTTP and both ctrl_fd and fd start at -1.  The
 * return statement is not visible in this excerpt. */
375 knetFile *khttp_parse_url(const char *fn, const char *mode)
380 if (strstr(fn, "http://") != fn) return 0;
382 for (p = (char*)fn + 7; *p && *p != '/'; ++p);
384 fp = calloc(1, sizeof(knetFile));
385 fp->http_host = calloc(l + 1, 1);
386 strncpy(fp->http_host, fn + 7, l);
387 fp->http_host[l] = 0;
388 for (q = fp->http_host; *q && *q != ':'; ++q);
389 if (*q == ':') *q++ = 0;
391 proxy = getenv("http_proxy");
392 // set ->host, ->port and ->path
394 fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
395 fp->port = strdup(*q? q : "80");
396 fp->path = strdup(*p? p : "/");
398 fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
399 for (q = fp->host; *q && *q != ':'; ++q);
400 if (*q == ':') *q++ = 0;
401 fp->port = strdup(*q? q : "80");
402 fp->path = strdup(fn);
404 fp->type = KNF_TYPE_HTTP;
405 fp->ctrl_fd = fp->fd = -1;
/* (Re)open the HTTP connection for fp and position it at fp->offset.
 *
 * Closes fp->fd if it is open, connects to fp->host:fp->port and sends an
 * HTTP/1.0 GET for fp->path with a Host header (fp->http_host) and
 * "Range: bytes=<offset>-".  The response header is then read one byte at
 * a time until the sequence "\r\n\r\n" is seen; fewer than 14 header bytes
 * is treated as a premature header.  The status code is parsed from buf+8.
 *
 * If the status is 200 although offset > 0, `offset` bytes of the body are
 * consumed with my_netread() in chunks of at most 0x10000.  Any status
 * other than 200 or 206 is reported on pysamerr.  The return statements
 * are not visible in this excerpt.
 *
 * NOTE(review): the socket_connect() result is not checked before
 * netwrite(); the request is built with unbounded sprintf() into the 64KB
 * buffer; strtol() is called with base 0 -- confirm. */
410 int khttp_connect_file(knetFile *fp)
414 if (fp->fd != -1) netclose(fp->fd);
415 fp->fd = socket_connect(fp->host, fp->port);
416 buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
417 l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
418 l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
419 l += sprintf(buf + l, "\r\n");
420 netwrite(fp->fd, buf, l);
422 while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
423 if (buf[l] == '\n' && l >= 3)
424 if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
428 if (l < 14) { // premature header
433 ret = strtol(buf + 8, &p, 0); // HTTP return code
434 if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
435 off_t rest = fp->offset;
437 off_t l = rest < 0x10000? rest : 0x10000;
438 rest -= my_netread(fp->fd, buf, l);
440 } else if (ret != 206 && ret != 200) {
442 fprintf(pysamerr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
452 /********************
454 ********************/
/* Open fn for reading and return a knetFile handle.
 *
 * mode: must start with 'r'; otherwise a message is written to pysamerr.
 * fn:   dispatched on its prefix:
 *       "ftp://"  -> kftp_parse_url(), kftp_connect(), kftp_connect_file()
 *       "http://" -> khttp_parse_url(), khttp_connect_file()
 *       other     -> opened as a local file with open(O_RDONLY), with
 *                    O_BINARY added in one of the two visible variants,
 *                    and wrapped in a calloc()'d KNF_TYPE_LOCAL handle.
 *
 * Returns 0 when URL parsing fails.  The final test catches a handle whose
 * fd is still -1; what is done then, and the remaining return statements,
 * are not visible in this excerpt. */
456 knetFile *knet_open(const char *fn, const char *mode)
459 if (mode[0] != 'r') {
460 fprintf(pysamerr, "[kftp_open] only mode \"r\" is supported.\n");
463 if (strstr(fn, "ftp://") == fn) {
464 fp = kftp_parse_url(fn, mode);
465 if (fp == 0) return 0;
466 if (kftp_connect(fp) == -1) {
470 kftp_connect_file(fp);
471 } else if (strstr(fn, "http://") == fn) {
472 fp = khttp_parse_url(fn, mode);
473 if (fp == 0) return 0;
474 khttp_connect_file(fp);
475 } else { // local file
477 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
478 * be undefined on some systems, although it is defined on my
479 * Mac and the Linux I have tested on. */
480 int fd = open(fn, O_RDONLY | O_BINARY);
482 int fd = open(fn, O_RDONLY);
488 fp = (knetFile*)calloc(1, sizeof(knetFile));
489 fp->type = KNF_TYPE_LOCAL;
493 if (fp && fp->fd == -1) {
/* Allocate a zeroed knetFile and mark it KNF_TYPE_LOCAL.
 * NOTE(review): the lines that store fd in the handle and return it are
 * not visible in this excerpt, and mode is not used in the visible lines
 * -- confirm. */
500 knetFile *knet_dopen(int fd, const char *mode)
502 knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
503 fp->type = KNF_TYPE_LOCAL;
/* Read up to len bytes from fp into buf.
 *
 * Returns 0 at once when fp->fd is -1.  For FTP and HTTP handles with
 * is_ready == 0 the data connection is set up first: FTP calls
 * kftp_reconnect() unless no_reconnect is set and then
 * kftp_connect_file(); HTTP calls khttp_connect_file().
 *
 * Local files are read with read() repeatedly, stopping when it returns 0;
 * all other handles go through my_netread().  The loop header and the
 * final return are not visible in this excerpt.
 *
 * NOTE(review): in the visible local-file lines a negative read() result
 * is not checked before it is added to l -- confirm. */
508 off_t knet_read(knetFile *fp, void *buf, off_t len)
511 if (fp->fd == -1) return 0;
512 if (fp->type == KNF_TYPE_FTP) {
513 if (fp->is_ready == 0) {
514 if (!fp->no_reconnect) kftp_reconnect(fp);
515 kftp_connect_file(fp);
517 } else if (fp->type == KNF_TYPE_HTTP) {
518 if (fp->is_ready == 0)
519 khttp_connect_file(fp);
521 if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
522 off_t rest = len, curr;
524 curr = read(fp->fd, buf + l, rest);
525 if (curr == 0) break;
526 l += curr; rest -= curr;
528 } else l = my_netread(fp->fd, buf, len);
/* Change the read position of fp.
 *
 * A SEEK_SET to the current fp->offset returns 0 immediately.
 * Local files:  lseek() is used; note it returns the new offset rather
 *               than 0 on success.
 * FTP handles:  branch on SEEK_CUR / SEEK_SET / SEEK_END; only the
 *               SEEK_END assignment (file_size + off) is visible here.
 * HTTP handles: SEEK_END is rejected with a message on pysamerr and the
 *               offset is left unchanged; otherwise branch on SEEK_CUR /
 *               SEEK_SET (assignments not visible here).
 *
 * The closing fprintf prints strerror(errno); the condition leading to it
 * and the return statements are not visible in this excerpt. */
533 off_t knet_seek(knetFile *fp, int64_t off, int whence)
535 if (whence == SEEK_SET && off == fp->offset) return 0;
536 if (fp->type == KNF_TYPE_LOCAL) {
537 /* Be aware that lseek() returns the offset after seeking,
538 * while fseek() returns zero on success. */
539 off_t offset = lseek(fp->fd, off, whence);
541 // Be silent, it is OK for knet_seek to fail when the file is streamed
542 // fprintf(pysamerr,"[knet_seek] %s\n", strerror(errno));
548 else if (fp->type == KNF_TYPE_FTP)
550 if (whence==SEEK_CUR)
552 else if (whence==SEEK_SET)
554 else if ( whence==SEEK_END)
555 fp->offset = fp->file_size+off;
559 else if (fp->type == KNF_TYPE_HTTP)
561 if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
562 fprintf(pysamerr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
566 if (whence==SEEK_CUR)
568 else if (whence==SEEK_SET)
574 fprintf(pysamerr,"[knet_seek] %s\n", strerror(errno));
/* Close the descriptors held by fp and free its strings.
 *
 * A null fp is accepted and yields 0.  The FTP control socket is closed
 * when ctrl_fd != -1.  fp->fd is closed with close() for local files and
 * with netclose() otherwise (a condition around that pair may sit on a
 * line not visible here).  host, port, response, retr, size_cmd, path and
 * http_host are passed to free().
 *
 * NOTE(review): the lines that free fp itself and return are not visible
 * in this excerpt. */
578 int knet_close(knetFile *fp)
580 if (fp == 0) return 0;
581 if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
583 /* On Linux/Mac, netclose() is an alias of close(), but on
584 * Windows, it is an alias of closesocket(). */
585 if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
586 else netclose(fp->fd);
588 free(fp->host); free(fp->port);
589 free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific
590 free(fp->path); free(fp->http_host); // HTTP specific
/* Test-driver statements; the enclosing function header is not visible in
 * this excerpt.  A 0x100000-byte buffer is allocated, and `type` selects a
 * scenario: a local file, two FTP URLs, or two HTTP URLs.  Each scenario
 * opens the target and seeks.  Types 1 and 4 read inside their branch and
 * the l bytes read are written to stdout; the other types read 255 bytes
 * afterwards.
 *
 * NOTE(review): the type 4 branch asks for up to 10000000 bytes at
 * buf+10000 while buf holds only 0x100000 bytes, which looks like a
 * possible overrun -- confirm.  knet_open() results are used without a
 * null check in the visible lines. */
604 buf = calloc(0x100000, 1);
606 fp = knet_open("knetfile.c", "r");
607 knet_seek(fp, 1000, SEEK_SET);
608 } else if (type == 1) { // NCBI FTP, large file
609 fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
610 knet_seek(fp, 2500000000ll, SEEK_SET);
611 l = knet_read(fp, buf, 255);
612 } else if (type == 2) {
613 fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
614 knet_seek(fp, 1000, SEEK_SET);
615 } else if (type == 3) {
616 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
617 knet_seek(fp, 1000, SEEK_SET);
618 } else if (type == 4) {
619 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
620 knet_read(fp, buf, 10000);
621 knet_seek(fp, 20000, SEEK_SET);
622 knet_seek(fp, 10000, SEEK_SET);
623 l = knet_read(fp, buf+10000, 10000000) + 10000;
625 if (type != 4 && type != 1) {
626 knet_read(fp, buf, 255);
629 } else write(fileno(stdout), buf, l);