2
0

crawler.c 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. /***************************************************************************
  2. * _ _ ____ _
  3. * Project ___| | | | _ \| |
  4. * / __| | | | |_) | |
  5. * | (__| |_| | _ <| |___
  6. * \___|\___/|_| \_\_____|
  7. *
  8. * Copyright (C) Jeroen Ooms <jeroenooms@gmail.com>
  9. *
  10. * This software is licensed as described in the file COPYING, which
  11. * you should have received as part of this distribution. The terms
  12. * are also available at https://curl.se/docs/copyright.html.
  13. *
  14. * You may opt to use, copy, modify, merge, publish, distribute and/or sell
  15. * copies of the Software, and permit persons to whom the Software is
  16. * furnished to do so, under the terms of the COPYING file.
  17. *
  18. * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  19. * KIND, either express or implied.
  20. *
  21. * SPDX-License-Identifier: curl
  22. *
  23. * To compile:
  24. * gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
  25. *
  26. */
  27. /* <DESC>
  28. * Web crawler based on curl and libxml2 to stress-test curl with
  29. * hundreds of concurrent connections to various servers.
  30. * </DESC>
  31. */
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <signal.h>
  40. /* Parameters */
  41. static int max_con = 200;
  42. static int max_total = 20000;
  43. static int max_requests = 500;
  44. static size_t max_link_per_page = 5;
  45. static int follow_relative_links = 0;
  46. static const char *start_page = "https://www.reuters.com";
  47. static int pending_interrupt = 0;
  48. static void sighandler(int dummy)
  49. {
  50. (void)dummy;
  51. pending_interrupt = 1;
  52. }
  53. /* resizable buffer */
  54. typedef struct {
  55. char *buf;
  56. size_t size;
  57. } memory;
  58. static size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
  59. {
  60. size_t realsize = sz * nmemb;
  61. memory *mem = (memory*) ctx;
  62. char *ptr = realloc(mem->buf, mem->size + realsize);
  63. if(!ptr) {
  64. /* out of memory */
  65. printf("not enough memory (realloc returned NULL)\n");
  66. return 0;
  67. }
  68. mem->buf = ptr;
  69. memcpy(&(mem->buf[mem->size]), contents, realsize);
  70. mem->size += realsize;
  71. return realsize;
  72. }
  73. static CURL *make_handle(const char *url)
  74. {
  75. CURL *handle = curl_easy_init();
  76. memory *mem;
  77. /* Important: use HTTP2 over HTTPS */
  78. curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
  79. curl_easy_setopt(handle, CURLOPT_URL, url);
  80. /* buffer body */
  81. mem = malloc(sizeof(memory));
  82. mem->size = 0;
  83. mem->buf = malloc(1);
  84. curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
  85. curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
  86. curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
  87. /* For completeness */
  88. curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
  89. curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
  90. curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
  91. /* only allow redirects to HTTP and HTTPS URLs */
  92. curl_easy_setopt(handle, CURLOPT_REDIR_PROTOCOLS_STR, "http,https");
  93. curl_easy_setopt(handle, CURLOPT_AUTOREFERER, 1L);
  94. curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
  95. /* each transfer needs to be done within 20 seconds! */
  96. curl_easy_setopt(handle, CURLOPT_TIMEOUT_MS, 20000L);
  97. /* connect fast or fail */
  98. curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT_MS, 2000L);
  99. /* skip files larger than a gigabyte */
  100. curl_easy_setopt(handle, CURLOPT_MAXFILESIZE_LARGE,
  101. (curl_off_t)1024*1024*1024);
  102. curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
  103. curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
  104. curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
  105. curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
  106. curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
  107. curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
  108. curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
  109. return handle;
  110. }
  111. /* HREF finder implemented in libxml2 but could be any HTML parser */
  112. static size_t follow_links(CURLM *multi_handle, memory *mem, const char *url)
  113. {
  114. int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
  115. HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
  116. htmlDocPtr doc = htmlReadMemory(mem->buf, (int)mem->size, url, NULL, opts);
  117. size_t count;
  118. int i;
  119. xmlChar *xpath;
  120. xmlNodeSetPtr nodeset;
  121. xmlXPathContextPtr context;
  122. xmlXPathObjectPtr result;
  123. if(!doc)
  124. return 0;
  125. xpath = (xmlChar*) "//a/@href";
  126. context = xmlXPathNewContext(doc);
  127. result = xmlXPathEvalExpression(xpath, context);
  128. xmlXPathFreeContext(context);
  129. if(!result)
  130. return 0;
  131. nodeset = result->nodesetval;
  132. if(xmlXPathNodeSetIsEmpty(nodeset)) {
  133. xmlXPathFreeObject(result);
  134. return 0;
  135. }
  136. count = 0;
  137. for(i = 0; i < nodeset->nodeNr; i++) {
  138. double r = rand();
  139. int x = (int)(r * nodeset->nodeNr / RAND_MAX);
  140. const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
  141. xmlChar *href = xmlNodeListGetString(doc, node, 1);
  142. char *link;
  143. if(follow_relative_links) {
  144. xmlChar *orig = href;
  145. href = xmlBuildURI(href, (xmlChar *) url);
  146. xmlFree(orig);
  147. }
  148. link = (char *) href;
  149. if(!link || strlen(link) < 20)
  150. continue;
  151. if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
  152. curl_multi_add_handle(multi_handle, make_handle(link));
  153. if(count++ == max_link_per_page)
  154. break;
  155. }
  156. xmlFree(link);
  157. }
  158. xmlXPathFreeObject(result);
  159. return count;
  160. }
  161. static int is_html(char *ctype)
  162. {
  163. return ctype != NULL && strlen(ctype) > 10 && strstr(ctype, "text/html");
  164. }
  165. int main(void)
  166. {
  167. CURLM *multi_handle;
  168. int msgs_left;
  169. int pending;
  170. int complete;
  171. int still_running;
  172. signal(SIGINT, sighandler);
  173. LIBXML_TEST_VERSION
  174. curl_global_init(CURL_GLOBAL_DEFAULT);
  175. multi_handle = curl_multi_init();
  176. curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
  177. curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
  178. /* enables http/2 if available */
  179. #ifdef CURLPIPE_MULTIPLEX
  180. curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
  181. #endif
  182. /* sets html start page */
  183. curl_multi_add_handle(multi_handle, make_handle(start_page));
  184. pending = 0;
  185. complete = 0;
  186. still_running = 1;
  187. while(still_running && !pending_interrupt) {
  188. int numfds;
  189. CURLMsg *m;
  190. curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
  191. curl_multi_perform(multi_handle, &still_running);
  192. /* See how the transfers went */
  193. m = NULL;
  194. while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
  195. if(m->msg == CURLMSG_DONE) {
  196. CURL *handle = m->easy_handle;
  197. char *url;
  198. memory *mem;
  199. curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
  200. curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
  201. if(m->data.result == CURLE_OK) {
  202. long res_status;
  203. curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
  204. if(res_status == 200) {
  205. char *ctype;
  206. curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
  207. printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
  208. if(is_html(ctype) && mem->size > 100) {
  209. if(pending < max_requests && (complete + pending) < max_total) {
  210. pending += follow_links(multi_handle, mem, url);
  211. still_running = 1;
  212. }
  213. }
  214. }
  215. else {
  216. printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
  217. }
  218. }
  219. else {
  220. printf("[%d] Connection failure: %s\n", complete, url);
  221. }
  222. curl_multi_remove_handle(multi_handle, handle);
  223. curl_easy_cleanup(handle);
  224. free(mem->buf);
  225. free(mem);
  226. complete++;
  227. pending--;
  228. }
  229. }
  230. }
  231. curl_multi_cleanup(multi_handle);
  232. curl_global_cleanup();
  233. return 0;
  234. }