3 Copyright (c) 2008 by Genome Research Ltd (GRL).
4 2010 by Attractive Chaos <attractor@live.co.uk>
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice shall be
15 included in all copies or substantial portions of the Software.
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 /* Probably I will not do socket programming in the next few years and
28 therefore I decide to heavily annotate this file, for Linux and
29 Windows as well. -ac */
38 #include <sys/types.h>
42 #include <arpa/inet.h>
43 #include <sys/socket.h>
48 /* In winsock.h, the type of a socket is SOCKET, which is: "typedef
49 * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
50 * integer -1. In knetfile.c, I use "int" for socket type
51 * throughout. This should be improved to avoid confusion.
53 * In Linux/Mac, recv() and read() do almost the same thing. You can see
54 * in the header file that netread() is simply an alias of read(). In
55 * Windows, however, they are different and using recv() is mandatory. */
58 /* This function tests if the file handler is ready for reading (or
59 * writing if is_read==0). */
/* Wait for descriptor `fd` with select(), using a 5-second timeout.
 * When is_read is non-zero the descriptor set is passed to select() as
 * the read set.  The select() result is kept in `ret`; diagnostics for
 * failures and time-outs go to stderr.
 * NOTE(review): the declarations of `tv` and `ret`, the fd_set setup,
 * the write-side branch and the return statement are not visible in
 * this listing -- confirm against the complete file. */
60 static int socket_wait(int fd, int is_read)
62 fd_set fds, *fdr = 0, *fdw = 0;
65 tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
68 if (is_read) fdr = &fds;
70 ret = select(fd+1, fdr, fdw, 0, &tv);
/* select() returned -1: report the errno text */
72 if (ret == -1) perror("select");
/* The two messages below use Winsock names (SOCKET_ERROR,
 * WSAGetLastError); presumably a Windows-only branch -- confirm. */
75 fprintf(stderr, "select time-out\n");
76 else if (ret == SOCKET_ERROR)
77 fprintf(stderr, "select: %d\n", WSAGetLastError());
83 /* This function does not work with Windows due to the lack of
84 * getaddrinfo() in winsock. It is adapted from an example in "Beej's
85 * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
/* Open a TCP connection to host:port (getaddrinfo()-based version).
 *
 * host  host name or numeric address to resolve
 * port  service name or decimal port number
 *
 * Returns the connected socket descriptor, or -1 on failure after
 * printing a diagnostic to stderr.  As before, only the first address
 * returned by getaddrinfo() is tried.  On every failure path the
 * partially set-up socket is closed and the address list released, so
 * no descriptor is leaked. */
static int socket_connect(const char *host, const char *port)
{
	int on = 1, fd = -1, rc;
	struct linger lng = { 0, 0 };
	struct addrinfo hints, *res = 0;

	memset(&hints, 0, sizeof(struct addrinfo));
	hints.ai_family = AF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	/* In Unix/Mac, getaddrinfo() is the most convenient way to get
	 * server information. It reports errors through its return value
	 * rather than errno, hence gai_strerror() instead of perror(). */
	rc = getaddrinfo(host, port, &hints, &res);
	if (rc != 0) {
		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rc));
		return -1; /* nothing was allocated, so nothing to release */
	}
	fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
	if (fd == -1) {
		perror("socket");
		goto fail;
	}
	/* The following two setsockopt() are used by ftplib
	 * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
	 * are necessary. */
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
		perror("setsockopt");
		goto fail;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) {
		perror("setsockopt");
		goto fail;
	}
	if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) {
		perror("connect");
		goto fail;
	}
	freeaddrinfo(res);
	return fd;

fail:
	if (fd != -1) close(fd);
	freeaddrinfo(res);
	return -1;
}
110 /* MinGW's printf has problem with "%lld" */
/* Format a signed 64-bit integer as decimal text.
 *
 * buf  destination buffer; needs room for up to 21 bytes
 *      (optional '-', at most 19 digits, terminating NUL)
 * x    value to format; negative values get a leading '-'
 *
 * Returns buf. */
char *int64tostr(char *buf, int64_t x)
{
	/* Work on the magnitude as an unsigned value so that INT64_MIN,
	 * whose negation does not fit in int64_t, is handled too. */
	uint64_t mag = x < 0 ? (uint64_t)0 - (uint64_t)x : (uint64_t)x;
	int start = 0, end, i;

	if (x < 0) buf[start++] = '-';
	end = start;
	do {
		buf[end++] = (char)('0' + (int)(mag % 10));
		mag /= 10;
	} while (mag);
	buf[end] = 0;
	/* Digits were emitted least-significant first: reverse them in place. */
	for (i = 0; i < (end - start) / 2; ++i) {
		char c = buf[start + i];
		buf[start + i] = buf[end - 1 - i];
		buf[end - 1 - i] = c;
	}
	return buf;
}
/* Parse the run of decimal digits at the start of buf.
 *
 * buf  NUL-terminated text expected to begin with digits
 *
 * Returns the value of the leading digits; 0 when buf does not start
 * with a digit.  Parsing stops at the first non-digit character
 * (e.g. a trailing CR/LF or space) instead of folding it into the
 * result.  No sign handling and no overflow detection are performed. */
int64_t strtoint64(const char *buf)
{
	int64_t x = 0;
	for (; *buf >= '0' && *buf <= '9'; ++buf)
		x = x * 10 + (int64_t)(*buf - '0');
	return x;
}
133 /* In windows, the first thing is to establish the TCP connection. */
/* Initialise Winsock: calls WSAStartup() requesting version 2.2 and
 * returns whatever WSAStartup() returns.
 * NOTE(review): the declaration of `wsaData` is not visible in this
 * listing. */
134 int knet_win32_init()
137 return WSAStartup(MAKEWORD(2, 2), &wsaData);
/* NOTE(review): only the signature is visible in this listing;
 * presumably the tear-down counterpart of knet_win32_init() -- confirm
 * against the complete file. */
139 void knet_win32_destroy()
143 /* A slightly modified version of the following function also works on
144 * Mac (and presumably Linux). However, this function is not stable on
145 * my Mac. It sometimes works fine but sometimes does not. Therefore for
146 * non-Windows OS, I do not use this one. */
/* Winsock variant of socket_connect(): open an IPv4 TCP connection.
 *
 * host  host name or dotted address; when its first character is
 *       alphabetic it is resolved with gethostbyname(), otherwise it is
 *       converted with inet_addr() and looked up with gethostbyaddr()
 * port  decimal port number, converted with atoi()
 *
 * Each failing step goes through the local __err_connect() macro, which
 * prints the step name together with WSAGetLastError().
 * NOTE(review): the rest of the macro body, the declarations of `on`,
 * `fd` and `addr`, and the return statements are not visible in this
 * listing. */
147 static SOCKET socket_connect(const char *host, const char *port)
149 #define __err_connect(func) \
151 fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \
157 struct linger lng = { 0, 0 };
158 struct sockaddr_in server;
159 struct hostent *hp = 0;
161 if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
162 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
163 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
165 if (isalpha(host[0])) hp = gethostbyname(host);
168 addr.s_addr = inet_addr(host);
169 hp = gethostbyaddr((char*)&addr, 4, AF_INET);
171 if (hp == 0) __err_connect("gethost");
/* fill in the IPv4 destination from the resolved entry */
173 server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
174 server.sin_family= AF_INET;
175 server.sin_port = htons(atoi(port));
176 if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
177 // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
/* Read up to len bytes from socket fd into buf, calling netread()
 * repeatedly because a single call may deliver less than requested.
 *
 * fd   connected socket descriptor
 * buf  destination buffer of at least len bytes
 * len  number of bytes wanted
 *
 * Returns the number of bytes actually stored, which is smaller than
 * len when the socket does not become readable (see socket_wait()),
 * when end-of-file is reached, or when netread() reports an error. */
static off_t my_netread(int fd, void *buf, off_t len)
{
	off_t rest = len, curr, l = 0;
	char *dst = buf; /* byte pointer: arithmetic on void* is not standard C */
	/* recv() and read() may not read the required length of data with
	 * one call. They have to be called repeatedly. */
	while (rest > 0) {
		if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
		curr = netread(fd, dst + l, rest);
		/* According to the glibc manual, section 13.2, a zero returned
		 * value indicates end-of-file (EOF), which should mean that
		 * read() will not return zero if EOF has not been met but data
		 * are not immediately available. A negative value is an error
		 * and must not be added to the running totals. */
		if (curr <= 0) break;
		l += curr; rest -= curr;
	}
	return l;
}
200 /*************************
201 * FTP specific routines *
202 *************************/
/* Read a reply from the FTP control connection into ftp->response.
 *
 * Returns 0 when socket_wait() says the control socket is not readable,
 * -1 when fewer than two bytes were collected, otherwise the number
 * that strtol() parses from the start of the stored reply.
 * NOTE(review): the declarations of `c`, `n` and `p` and part of the
 * loop body are not visible in this listing. */
204 static int kftp_get_response(knetFile *ftp)
213 if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
214 while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
/* grow the reply buffer: 256 bytes at first, doubled afterwards */
216 if (n >= ftp->max_response) {
217 ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
218 ftp->response = realloc(ftp->response, ftp->max_response);
220 ftp->response[n++] = c;
/* stop once the buffer starts with three digits not followed by '-' */
222 if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
223 && ftp->response[3] != '-') break;
228 if (n < 2) return -1;
/* cut the text two bytes before its end (presumably the CR LF pair) */
229 ftp->response[n-2] = 0;
230 return strtol(ftp->response, &p, 0);
/* Write the command string `cmd` to the FTP control socket.
 *
 * Returns -1 when socket_wait() reports the socket is not writable.
 * After a successful write the result is kftp_get_response(ftp) when
 * is_get is non-zero, and 0 otherwise.  A failed netwrite() prints a
 * message to stderr.
 * NOTE(review): the rest of the write-failure branch is not visible in
 * this listing. */
233 static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
235 if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
236 if (netwrite(ftp->ctrl_fd, cmd, strlen(cmd)) < 0) {
237 fprintf(stderr, "[%s] error sending command to socket.\n", __func__);
240 return is_get? kftp_get_response(ftp) : 0;
/* Send "PASV" on the control connection and record the data endpoint
 * announced in the reply.
 *
 * The reply text is searched for '('; -1 is returned when none is
 * found.  Six comma-separated integers are then scanned into v[]: the
 * first four are copied to ftp->pasv_ip and the last two are combined
 * into ftp->pasv_port as (v[4] << 8) + v[5].
 * NOTE(review): the declarations of `p` and `v` and the final return
 * are not visible in this listing; the sscanf() result is not checked
 * in the visible code. */
243 static int kftp_pasv_prep(knetFile *ftp)
247 kftp_send_cmd(ftp, "PASV\r\n", 1);
248 for (p = ftp->response; *p && *p != '('; ++p);
249 if (*p != '(') return -1;
251 sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
252 memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
253 ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
/* Open the FTP data connection to the endpoint stored in
 * ftp->pasv_ip / ftp->pasv_port and keep the descriptor in ftp->fd.
 *
 * A zero pasv_port triggers a message saying kftp_pasv_prep() was not
 * called first.  Returns -1 when socket_connect() fails.
 * NOTE(review): the end of the pasv_port==0 branch and the success
 * return are not visible in this listing. */
258 static int kftp_pasv_connect(knetFile *ftp)
260 char host[80], port[10];
261 if (ftp->pasv_port == 0) {
262 fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
/* render the address as dotted quad and the port as decimal text */
265 sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
266 sprintf(port, "%d", ftp->pasv_port);
267 ftp->fd = socket_connect(host, port);
268 if (ftp->fd == -1) return -1;
/* Open the FTP control connection to ftp->host:ftp->port and log in.
 *
 * Returns -1 when socket_connect() fails.  Otherwise the server
 * greeting is read and the commands USER anonymous, PASS kftp@ and
 * TYPE I are sent in that order, each waiting for a reply; the reply
 * codes are not examined in the visible code.
 * NOTE(review): the final return statement is not visible in this
 * listing. */
272 int kftp_connect(knetFile *ftp)
274 ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
275 if (ftp->ctrl_fd == -1) return -1;
276 kftp_get_response(ftp);
277 kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
278 kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
279 kftp_send_cmd(ftp, "TYPE I\r\n", 1);
/* Close the existing control connection, if one is open (ctrl_fd is
 * not -1), then log in again; returns the result of kftp_connect().
 * NOTE(review): the remainder of the close branch is not visible in
 * this listing. */
283 int kftp_reconnect(knetFile *ftp)
285 if (ftp->ctrl_fd != -1) {
286 netclose(ftp->ctrl_fd);
291 return kftp_connect(ftp);
294 // initialize ->type, ->host, ->retr and ->size
/* Build a knetFile describing an "ftp://host/path" URL.
 *
 * fn    URL; must start with "ftp://" and contain a '/' after the host
 *       part, otherwise 0 is returned
 * mode  open mode; a 'c' anywhere in it sets fp->no_reconnect
 *
 * The new object gets type KNF_TYPE_FTP, port "21", a copy of the host
 * part, and the prebuilt commands "RETR <path>" and "SIZE <path>".
 * NOTE(review): the declarations of `fp`, `p` and `l`, the computation
 * of `l` and the return statement are not visible in this listing;
 * allocation results are not checked in the visible code. */
295 knetFile *kftp_parse_url(const char *fn, const char *mode)
300 if (strstr(fn, "ftp://") != fn) return 0;
/* advance p to the first '/' after the "ftp://" prefix */
301 for (p = (char*)fn + 6; *p && *p != '/'; ++p);
302 if (*p != '/') return 0;
304 fp = calloc(1, sizeof(knetFile));
305 fp->type = KNF_TYPE_FTP;
307 /* the Linux/Mac version of socket_connect() also recognizes a port
308 * like "ftp", but the Windows version does not. */
309 fp->port = strdup("21");
310 fp->host = calloc(l + 1, 1);
311 if (strchr(mode, 'c')) fp->no_reconnect = 1;
312 strncpy(fp->host, fn + 6, l);
/* strlen(p) + 8 covers "RETR " / "SIZE " (5), "\r\n" (2) and the NUL */
313 fp->retr = calloc(strlen(p) + 8, 1);
314 sprintf(fp->retr, "RETR %s\r\n", p);
315 fp->size_cmd = calloc(strlen(p) + 8, 1);
316 sprintf(fp->size_cmd, "SIZE %s\r\n", p);
320 // place ->fd at offset off
/* Prepare the FTP data connection for reading from fp->offset.
 *
 * Visible steps: optionally read a pending reply when no_reconnect is
 * set; send the SIZE command and store the parsed size in
 * fp->file_size; send "REST <offset>"; send the RETR command without
 * waiting for its reply; open the data connection with
 * kftp_pasv_connect(); then read the reply into `ret`.
 * Two alternative code paths appear for parsing the size (sscanf with
 * %lld vs. strtoint64) and for formatting the offset (sprintf with
 * %lld vs. int64tostr).
 * NOTE(review): presumably these are selected by preprocessor
 * conditionals that are not visible here, as are the declarations of
 * `ret`, `file_size` and `tmp` and the return statements. */
321 int kftp_connect_file(knetFile *fp)
327 if (fp->no_reconnect) kftp_get_response(fp);
330 kftp_send_cmd(fp, fp->size_cmd, 1);
332 if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
334 fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
/* alternative path: skip to the first digit after the first space */
338 const char *p = fp->response;
339 while (*p != ' ') ++p;
340 while (*p < '0' || *p > '9') ++p;
341 file_size = strtoint64(p);
343 fp->file_size = file_size;
347 sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
349 strcpy(tmp, "REST ");
350 int64tostr(tmp + 5, fp->offset);
353 kftp_send_cmd(fp, tmp, 1);
355 kftp_send_cmd(fp, fp->retr, 0);
356 kftp_pasv_connect(fp);
357 ret = kftp_get_response(fp);
359 fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
369 /**************************
370 * HTTP specific routines *
371 **************************/
/* Build a knetFile describing an "http://host[:port]/path" URL.
 *
 * fn    URL; 0 is returned unless it starts with "http://"
 * mode  open mode (not used by the visible lines)
 *
 * fp->http_host receives the host part with any ":port" suffix cut
 * off.  The http_proxy environment variable is read; two sets of
 * assignments to host/port/path follow: one taking the server from the
 * URL (port defaulting to "80", path to "/"), one taking host and port
 * from the proxy value and using the full URL as path.  Finally the
 * type is set to KNF_TYPE_HTTP and both descriptors to -1.
 * NOTE(review): the condition choosing between the two sets, the
 * declarations, the computation of `l` and the return statement are
 * not visible in this listing. */
373 knetFile *khttp_parse_url(const char *fn, const char *mode)
378 if (strstr(fn, "http://") != fn) return 0;
380 for (p = (char*)fn + 7; *p && *p != '/'; ++p);
382 fp = calloc(1, sizeof(knetFile));
383 fp->http_host = calloc(l + 1, 1);
384 strncpy(fp->http_host, fn + 7, l);
385 fp->http_host[l] = 0;
/* split "host:port" in place; q ends up at the port text or at the NUL */
386 for (q = fp->http_host; *q && *q != ':'; ++q);
387 if (*q == ':') *q++ = 0;
389 proxy = getenv("http_proxy");
390 // set ->host, ->port and ->path
392 fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
393 fp->port = strdup(*q? q : "80");
394 fp->path = strdup(*p? p : "/");
396 fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
397 for (q = fp->host; *q && *q != ':'; ++q);
398 if (*q == ':') *q++ = 0;
399 fp->port = strdup(*q? q : "80");
400 fp->path = strdup(fn);
402 fp->type = KNF_TYPE_HTTP;
403 fp->ctrl_fd = fp->fd = -1;
/* (Re)open the HTTP connection and position it at fp->offset.
 *
 * Any open descriptor is closed, a new connection is made to
 * fp->host:fp->port and a "GET <path> HTTP/1.0" request with Host and
 * "Range: bytes=<offset>-" headers is written.  The response header is
 * read one byte at a time until "\r\n\r\n" is seen.  The status code
 * is parsed from byte 8 of the header: 200 with a positive offset
 * means the body is read and discarded up to the offset in chunks of
 * at most 0x10000 bytes; any code other than 200 or 206 prints an
 * error.
 * NOTE(review): the declarations, several branch bodies, the buffer
 * release and the return statements are not visible in this listing;
 * the visible code checks neither the socket_connect() result nor the
 * header length against the 0x10000-byte buffer. */
408 int khttp_connect_file(knetFile *fp)
412 if (fp->fd != -1) netclose(fp->fd);
413 fp->fd = socket_connect(fp->host, fp->port);
414 buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
415 l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
416 l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
417 l += sprintf(buf + l, "\r\n");
418 if (netwrite(fp->fd, buf, l) < 0) {
419 fprintf(stderr, "[%s] fail write GET line.\n", __func__);
423 while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
424 if (buf[l] == '\n' && l >= 3)
425 if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
429 if (l < 14) { // prematured header
434 ret = strtol(buf + 8, &p, 0); // HTTP return code
435 if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
436 off_t rest = fp->offset;
438 off_t l = rest < 0x10000? rest : 0x10000;
439 rest -= my_netread(fp->fd, buf, l);
441 } else if (ret != 206 && ret != 200) {
443 fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
453 /********************
455 ********************/
/* Open a local file, an ftp:// URL or an http:// URL for reading.
 *
 * fn    path or URL; the prefix selects the backend
 * mode  must begin with 'r', otherwise a message is printed
 *
 * FTP:   kftp_parse_url(), kftp_connect(), kftp_connect_file().
 * HTTP:  khttp_parse_url(), khttp_connect_file().
 * Local: open() with O_RDONLY (a variant with O_BINARY is also
 *        present), wrapped in a calloc'ed knetFile of type
 *        KNF_TYPE_LOCAL.
 * Returns 0 when URL parsing fails.
 * NOTE(review): the declaration of `fp`, the failure branches, the
 * handling after the final fd == -1 test and the return statement are
 * not visible in this listing. */
457 knetFile *knet_open(const char *fn, const char *mode)
460 if (mode[0] != 'r') {
461 fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
464 if (strstr(fn, "ftp://") == fn) {
465 fp = kftp_parse_url(fn, mode);
466 if (fp == 0) return 0;
467 if (kftp_connect(fp) == -1) {
471 kftp_connect_file(fp);
472 } else if (strstr(fn, "http://") == fn) {
473 fp = khttp_parse_url(fn, mode);
474 if (fp == 0) return 0;
475 khttp_connect_file(fp);
476 } else { // local file
478 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
479 * be undefined on some systems, although it is defined on my
480 * Mac and the Linux I have tested on. */
481 int fd = open(fn, O_RDONLY | O_BINARY);
483 int fd = open(fn, O_RDONLY);
489 fp = (knetFile*)calloc(1, sizeof(knetFile));
490 fp->type = KNF_TYPE_LOCAL;
494 if (fp && fp->fd == -1) {
501 knetFile *knet_dopen(int fd, const char *mode)
503 knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
504 fp->type = KNF_TYPE_LOCAL;
/* Read up to len bytes from fp into buf.
 *
 * Returns 0 at once when fp->fd is -1.  For FTP and HTTP files that
 * are not ready (is_ready == 0) the connection is (re)established
 * first: FTP reconnects the control channel unless no_reconnect is
 * set, then calls kftp_connect_file(); HTTP calls
 * khttp_connect_file().  Local files are read with read(), retrying
 * when interrupted (EINTR) and returning -1 on any other error; remote
 * files are read with my_netread().
 * NOTE(review): the declaration of `l`, the loop headers, any offset
 * bookkeeping and the return statement are not visible in this
 * listing. */
509 off_t knet_read(knetFile *fp, void *buf, off_t len)
512 if (fp->fd == -1) return 0;
513 if (fp->type == KNF_TYPE_FTP) {
514 if (fp->is_ready == 0) {
515 if (!fp->no_reconnect) kftp_reconnect(fp);
516 kftp_connect_file(fp);
518 } else if (fp->type == KNF_TYPE_HTTP) {
519 if (fp->is_ready == 0)
520 khttp_connect_file(fp);
522 if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
523 off_t rest = len, curr;
526 curr = read(fp->fd, buf + l, rest);
527 } while (curr < 0 && EINTR == errno);
528 if (curr < 0) return -1;
/* zero bytes read means end of file */
529 if (curr == 0) break;
530 l += curr; rest -= curr;
532 } else l = my_netread(fp->fd, buf, len);
/* Change the read position of fp.
 *
 * Returns 0 immediately when asked to SEEK_SET to the current offset.
 * Local files are repositioned with lseek().  For FTP the visible code
 * distinguishes SEEK_CUR, SEEK_SET and SEEK_END, the latter setting
 * fp->offset to fp->file_size + off.  For HTTP, SEEK_END is rejected
 * with a message and the offset is left unchanged.
 * NOTE(review): the SEEK_CUR/SEEK_SET assignments, the is_ready
 * handling, the return statements and the condition guarding the final
 * error message are not visible in this listing. */
537 off_t knet_seek(knetFile *fp, int64_t off, int whence)
539 if (whence == SEEK_SET && off == fp->offset) return 0;
540 if (fp->type == KNF_TYPE_LOCAL) {
541 /* Be aware that lseek() returns the offset after seeking,
542 * while fseek() returns zero on success. */
543 off_t offset = lseek(fp->fd, off, whence);
545 // Be silent, it is OK for knet_seek to fail when the file is streamed
546 // fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
552 else if (fp->type == KNF_TYPE_FTP)
554 if (whence==SEEK_CUR)
556 else if (whence==SEEK_SET)
558 else if ( whence==SEEK_END)
559 fp->offset = fp->file_size+off;
563 else if (fp->type == KNF_TYPE_HTTP)
565 if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
566 fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
570 if (whence==SEEK_CUR)
572 else if (whence==SEEK_SET)
578 fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
/* Close fp and release the strings it owns.
 *
 * A null fp is accepted and yields 0.  The control socket is closed
 * when ctrl_fd is not -1; the data descriptor is closed with close()
 * for local files and netclose() otherwise.  host, port, response,
 * retr, path and http_host are freed.
 * NOTE(review): any guard around the descriptor close, the release of
 * fp itself and of size_cmd, and the return value are not visible in
 * this listing. */
582 int knet_close(knetFile *fp)
584 if (fp == 0) return 0;
585 if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
587 /* On Linux/Mac, netclose() is an alias of close(), but on
588 * Windows, it is an alias of closesocket(). */
589 if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
590 else netclose(fp->fd);
592 free(fp->host); free(fp->port);
593 free(fp->response); free(fp->retr); // FTP specific
594 free(fp->path); free(fp->http_host); // HTTP specific
/* Manual test driver fragment.
 * NOTE(review): the enclosing function header and the declarations of
 * `buf`, `fp`, `l` and `type` are not visible in this listing.
 * A 0x100000-byte buffer is allocated; depending on `type` a local
 * file, an FTP URL or an HTTP URL is opened with knet_open() and
 * positioned with knet_seek().  Types 1 and 4 read inside their own
 * branch and their data is written to stdout; the other types read 255
 * bytes afterwards.  These cases need network access to the named
 * hosts. */
608 buf = calloc(0x100000, 1);
610 fp = knet_open("knetfile.c", "r");
611 knet_seek(fp, 1000, SEEK_SET);
612 } else if (type == 1) { // NCBI FTP, large file
613 fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
614 knet_seek(fp, 2500000000ll, SEEK_SET);
615 l = knet_read(fp, buf, 255);
616 } else if (type == 2) {
617 fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
618 knet_seek(fp, 1000, SEEK_SET);
619 } else if (type == 3) {
620 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
621 knet_seek(fp, 1000, SEEK_SET);
622 } else if (type == 4) {
623 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
/* read, seek forward, seek back, then read again from offset 10000 */
624 knet_read(fp, buf, 10000);
625 knet_seek(fp, 20000, SEEK_SET);
626 knet_seek(fp, 10000, SEEK_SET);
627 l = knet_read(fp, buf+10000, 10000000) + 10000;
629 if (type != 4 && type != 1) {
630 knet_read(fp, buf, 255);
633 } else write(fileno(stdout), buf, l);