5 Copyright (c) 2008 by Genome Research Ltd (GRL).
6 2010 by Attractive Chaos <attractor@live.co.uk>
8 Permission is hereby granted, free of charge, to any person obtaining
9 a copy of this software and associated documentation files (the
10 "Software"), to deal in the Software without restriction, including
11 without limitation the rights to use, copy, modify, merge, publish,
12 distribute, sublicense, and/or sell copies of the Software, and to
13 permit persons to whom the Software is furnished to do so, subject to
14 the following conditions:
16 The above copyright notice and this permission notice shall be
17 included in all copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
23 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
24 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
25 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29 /* Probably I will not do socket programming in the next few years and
30 therefore I decide to heavily annotate this file, for Linux and
31 Windows as well. -ac */
40 #include <sys/types.h>
44 #include <arpa/inet.h>
45 #include <sys/socket.h>
50 /* In winsock.h, the type of a socket is SOCKET, which is: "typedef
51 * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
52 * integer -1. In knetfile.c, I use "int" for socket type
53 * throughout. This should be improved to avoid confusion.
55 * In Linux/Mac, recv() and read() do almost the same thing. You can see
56 * in the header file that netread() is simply an alias of read(). In
57 * Windows, however, they are different and using recv() is mandatory.
60 /* This function tests if the file descriptor is ready for reading (or
61 * for writing if is_read==0). */
62 static int socket_wait(int fd, int is_read)
64 fd_set fds, *fdr = 0, *fdw = 0;
67 tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
70 if (is_read) fdr = &fds;
72 ret = select(fd+1, fdr, fdw, 0, &tv);
74 if (ret == -1) perror("select");
77 fprintf(pysamerr, "select time-out\n");
78 else if (ret == SOCKET_ERROR)
79 fprintf(pysamerr, "select: %d\n", WSAGetLastError());
85 /* This function does not work with Windows due to the lack of
86 * getaddrinfo() in winsock. It is adapted from an example in "Beej's
87 * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
88 static int socket_connect(const char *host, const char *port)
90 #define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
93 struct linger lng = { 0, 0 };
94 struct addrinfo hints, *res = 0;
95 memset(&hints, 0, sizeof(struct addrinfo));
96 hints.ai_family = AF_UNSPEC;
97 hints.ai_socktype = SOCK_STREAM;
98 /* In Unix/Mac, getaddrinfo() is the most convenient way to get
99 * server information. */
100 if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
101 if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
102 /* The following two setsockopt() are used by ftplib
103 * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
105 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
106 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
107 if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
112 /* MinGW's printf has problems with "%lld" */
113 char *int64tostr(char *buf, int64_t x)
118 buf[i++] = '0' + x % 10;
122 for (cnt = i, i = 0; i < cnt/2; ++i) {
123 int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
128 int64_t strtoint64(const char *buf)
131 for (x = 0; *buf != '\0'; ++buf)
132 x = x * 10 + ((int64_t) *buf - 48);
135 /* On Windows, the first thing to do is to initialize Winsock (WSAStartup) before any socket is used. */
136 int knet_win32_init()
139 return WSAStartup(MAKEWORD(2, 2), &wsaData);
141 void knet_win32_destroy()
145 /* A slightly modified version of the following function also works on
146 * Mac (and presumably Linux). However, this function is not stable on
147 * my Mac. It sometimes works fine but sometimes does not. Therefore for
148 * non-Windows OS, I do not use this one. */
149 static SOCKET socket_connect(const char *host, const char *port)
151 #define __err_connect(func) \
153 fprintf(pysamerr, "%s: %d\n", func, WSAGetLastError()); \
159 struct linger lng = { 0, 0 };
160 struct sockaddr_in server;
161 struct hostent *hp = 0;
163 if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
164 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
165 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
167 if (isalpha(host[0])) hp = gethostbyname(host);
170 addr.s_addr = inet_addr(host);
171 hp = gethostbyaddr((char*)&addr, 4, AF_INET);
173 if (hp == 0) __err_connect("gethost");
175 server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
176 server.sin_family= AF_INET;
177 server.sin_port = htons(atoi(port));
178 if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
179 // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
184 static off_t my_netread(int fd, void *buf, off_t len)
186 off_t rest = len, curr, l = 0;
187 /* recv() and read() may not read the required length of data with
188 * one call. They have to be called repeatedly. */
190 if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
191 curr = netread(fd, buf + l, rest);
192 /* According to the glibc manual, section 13.2, a zero returned
193 * value indicates end-of-file (EOF), which should mean that
194 * read() will not return zero if EOF has not been met but data
195 * are not immediately available. */
196 if (curr == 0) break;
197 l += curr; rest -= curr;
202 /*************************
203 * FTP specific routines *
204 *************************/
206 static int kftp_get_response(knetFile *ftp)
215 if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
216 while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
217 //fputc(c, pysamerr);
218 if (n >= ftp->max_response) {
219 ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
220 ftp->response = realloc(ftp->response, ftp->max_response);
222 ftp->response[n++] = c;
224 if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
225 && ftp->response[3] != '-') break;
230 if (n < 2) return -1;
231 ftp->response[n-2] = 0;
232 return strtol(ftp->response, &p, 0);
235 static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
237 if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
238 netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
239 return is_get? kftp_get_response(ftp) : 0;
242 static int kftp_pasv_prep(knetFile *ftp)
246 kftp_send_cmd(ftp, "PASV\r\n", 1);
247 for (p = ftp->response; *p && *p != '('; ++p);
248 if (*p != '(') return -1;
250 sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
251 memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
252 ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
257 static int kftp_pasv_connect(knetFile *ftp)
259 char host[80], port[10];
260 if (ftp->pasv_port == 0) {
261 fprintf(pysamerr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
264 sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
265 sprintf(port, "%d", ftp->pasv_port);
266 ftp->fd = socket_connect(host, port);
267 if (ftp->fd == -1) return -1;
271 int kftp_connect(knetFile *ftp)
273 ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
274 if (ftp->ctrl_fd == -1) return -1;
275 kftp_get_response(ftp);
276 kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
277 kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
278 kftp_send_cmd(ftp, "TYPE I\r\n", 1);
282 int kftp_reconnect(knetFile *ftp)
284 if (ftp->ctrl_fd != -1) {
285 netclose(ftp->ctrl_fd);
290 return kftp_connect(ftp);
293 // initialize ->type, ->host, ->port, ->retr and ->size_cmd
294 knetFile *kftp_parse_url(const char *fn, const char *mode)
299 if (strstr(fn, "ftp://") != fn) return 0;
300 for (p = (char*)fn + 6; *p && *p != '/'; ++p);
301 if (*p != '/') return 0;
303 fp = calloc(1, sizeof(knetFile));
304 fp->type = KNF_TYPE_FTP;
306 /* the Linux/Mac version of socket_connect() also recognizes a port
307 * like "ftp", but the Windows version does not. */
308 fp->port = strdup("21");
309 fp->host = calloc(l + 1, 1);
310 if (strchr(mode, 'c')) fp->no_reconnect = 1;
311 strncpy(fp->host, fn + 6, l);
312 fp->retr = calloc(strlen(p) + 8, 1);
313 sprintf(fp->retr, "RETR %s\r\n", p);
314 fp->size_cmd = calloc(strlen(p) + 8, 1);
315 sprintf(fp->size_cmd, "SIZE %s\r\n", p);
319 // place ->fd at offset ->offset
320 int kftp_connect_file(knetFile *fp)
326 if (fp->no_reconnect) kftp_get_response(fp);
329 kftp_send_cmd(fp, fp->size_cmd, 1);
331 if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
333 fprintf(pysamerr,"[kftp_connect_file] %s\n", fp->response);
337 const char *p = fp->response;
338 while (*p != ' ') ++p;
339 while (*p < '0' || *p > '9') ++p;
340 file_size = strtoint64(p);
342 fp->file_size = file_size;
346 sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
348 strcpy(tmp, "REST ");
349 int64tostr(tmp + 5, fp->offset);
352 kftp_send_cmd(fp, tmp, 1);
354 kftp_send_cmd(fp, fp->retr, 0);
355 kftp_pasv_connect(fp);
356 ret = kftp_get_response(fp);
358 fprintf(pysamerr, "[kftp_connect_file] %s\n", fp->response);
368 /**************************
369 * HTTP specific routines *
370 **************************/
372 knetFile *khttp_parse_url(const char *fn, const char *mode)
377 if (strstr(fn, "http://") != fn) return 0;
379 for (p = (char*)fn + 7; *p && *p != '/'; ++p);
381 fp = calloc(1, sizeof(knetFile));
382 fp->http_host = calloc(l + 1, 1);
383 strncpy(fp->http_host, fn + 7, l);
384 fp->http_host[l] = 0;
385 for (q = fp->http_host; *q && *q != ':'; ++q);
386 if (*q == ':') *q++ = 0;
388 proxy = getenv("http_proxy");
389 // set ->host, ->port and ->path
391 fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
392 fp->port = strdup(*q? q : "80");
393 fp->path = strdup(*p? p : "/");
395 fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
396 for (q = fp->host; *q && *q != ':'; ++q);
397 if (*q == ':') *q++ = 0;
398 fp->port = strdup(*q? q : "80");
399 fp->path = strdup(fn);
401 fp->type = KNF_TYPE_HTTP;
402 fp->ctrl_fd = fp->fd = -1;
407 int khttp_connect_file(knetFile *fp)
411 if (fp->fd != -1) netclose(fp->fd);
412 fp->fd = socket_connect(fp->host, fp->port);
413 buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
414 l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
415 l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
416 l += sprintf(buf + l, "\r\n");
417 netwrite(fp->fd, buf, l);
419 while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
420 if (buf[l] == '\n' && l >= 3)
421 if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
425 if (l < 14) { // prematured header
430 ret = strtol(buf + 8, &p, 0); // HTTP return code
431 if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
432 off_t rest = fp->offset;
434 off_t l = rest < 0x10000? rest : 0x10000;
435 rest -= my_netread(fp->fd, buf, l);
437 } else if (ret != 206 && ret != 200) {
439 fprintf(pysamerr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
449 /********************
451 ********************/
453 knetFile *knet_open(const char *fn, const char *mode)
456 if (mode[0] != 'r') {
457 fprintf(pysamerr, "[kftp_open] only mode \"r\" is supported.\n");
460 if (strstr(fn, "ftp://") == fn) {
461 fp = kftp_parse_url(fn, mode);
462 if (fp == 0) return 0;
463 if (kftp_connect(fp) == -1) {
467 kftp_connect_file(fp);
468 } else if (strstr(fn, "http://") == fn) {
469 fp = khttp_parse_url(fn, mode);
470 if (fp == 0) return 0;
471 khttp_connect_file(fp);
472 } else { // local file
474 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
475 * be undefined on some systems, although it is defined on my
476 * Mac and the Linux I have tested on. */
477 int fd = open(fn, O_RDONLY | O_BINARY);
479 int fd = open(fn, O_RDONLY);
485 fp = (knetFile*)calloc(1, sizeof(knetFile));
486 fp->type = KNF_TYPE_LOCAL;
490 if (fp && fp->fd == -1) {
497 knetFile *knet_dopen(int fd, const char *mode)
499 knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
500 fp->type = KNF_TYPE_LOCAL;
505 off_t knet_read(knetFile *fp, void *buf, off_t len)
508 if (fp->fd == -1) return 0;
509 if (fp->type == KNF_TYPE_FTP) {
510 if (fp->is_ready == 0) {
511 if (!fp->no_reconnect) kftp_reconnect(fp);
512 kftp_connect_file(fp);
514 } else if (fp->type == KNF_TYPE_HTTP) {
515 if (fp->is_ready == 0)
516 khttp_connect_file(fp);
518 if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
519 off_t rest = len, curr;
522 curr = read(fp->fd, buf + l, rest);
523 } while (curr < 0 && EINTR == errno);
524 if (curr < 0) return -1;
525 if (curr == 0) break;
526 l += curr; rest -= curr;
528 } else l = my_netread(fp->fd, buf, len);
533 off_t knet_seek(knetFile *fp, int64_t off, int whence)
535 if (whence == SEEK_SET && off == fp->offset) return 0;
536 if (fp->type == KNF_TYPE_LOCAL) {
537 /* Be aware that lseek() returns the offset after seeking,
538 * while fseek() returns zero on success. */
539 off_t offset = lseek(fp->fd, off, whence);
541 // Be silent, it is OK for knet_seek to fail when the file is streamed
542 // fprintf(pysamerr,"[knet_seek] %s\n", strerror(errno));
548 else if (fp->type == KNF_TYPE_FTP)
550 if (whence==SEEK_CUR)
552 else if (whence==SEEK_SET)
554 else if ( whence==SEEK_END)
555 fp->offset = fp->file_size+off;
559 else if (fp->type == KNF_TYPE_HTTP)
561 if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
562 fprintf(pysamerr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
566 if (whence==SEEK_CUR)
568 else if (whence==SEEK_SET)
574 fprintf(pysamerr,"[knet_seek] %s\n", strerror(errno));
578 int knet_close(knetFile *fp)
580 if (fp == 0) return 0;
581 if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
583 /* On Linux/Mac, netclose() is an alias of close(), but on
584 * Windows, it is an alias of closesocket(). */
585 if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
586 else netclose(fp->fd);
588 free(fp->host); free(fp->port);
589 free(fp->response); free(fp->retr); // FTP specific
590 free(fp->path); free(fp->http_host); // HTTP specific
604 buf = calloc(0x100000, 1);
606 fp = knet_open("knetfile.c", "r");
607 knet_seek(fp, 1000, SEEK_SET);
608 } else if (type == 1) { // NCBI FTP, large file
609 fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
610 knet_seek(fp, 2500000000ll, SEEK_SET);
611 l = knet_read(fp, buf, 255);
612 } else if (type == 2) {
613 fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
614 knet_seek(fp, 1000, SEEK_SET);
615 } else if (type == 3) {
616 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
617 knet_seek(fp, 1000, SEEK_SET);
618 } else if (type == 4) {
619 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
620 knet_read(fp, buf, 10000);
621 knet_seek(fp, 20000, SEEK_SET);
622 knet_seek(fp, 10000, SEEK_SET);
623 l = knet_read(fp, buf+10000, 10000000) + 10000;
625 if (type != 4 && type != 1) {
626 knet_read(fp, buf, 255);
629 } else write(fileno(stdout), buf, l);