fs_sharetree.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. /*
  2. This file is part of GNUnet
  3. Copyright (C) 2005-2012 GNUnet e.V.
  4. GNUnet is free software: you can redistribute it and/or modify it
  5. under the terms of the GNU Affero General Public License as published
  6. by the Free Software Foundation, either version 3 of the License,
  7. or (at your option) any later version.
  8. GNUnet is distributed in the hope that it will be useful, but
  9. WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Affero General Public License for more details.
  12. You should have received a copy of the GNU Affero General Public License
  13. along with this program. If not, see <http://www.gnu.org/licenses/>.
  14. SPDX-License-Identifier: AGPL3.0-or-later
  15. */
  16. /**
  17. * @file fs/fs_sharetree.c
  18. * @brief code to manipulate the 'struct GNUNET_FS_ShareTreeItem' tree
  19. * @author LRN
  20. * @author Christian Grothoff
  21. */
  22. #include "platform.h"
  23. #include "gnunet_fs_service.h"
  24. #include "gnunet_scheduler_lib.h"
  25. #include <pthread.h>
  26. /**
  27. * Entry for each unique keyword to track how often
  28. * it occured. Contains the keyword and the counter.
  29. */
  30. struct KeywordCounter
  31. {
  32. /**
  33. * This is a doubly-linked list
  34. */
  35. struct KeywordCounter *prev;
  36. /**
  37. * This is a doubly-linked list
  38. */
  39. struct KeywordCounter *next;
  40. /**
  41. * Keyword that was found.
  42. */
  43. const char *value;
  44. /**
  45. * How many files have this keyword?
  46. */
  47. unsigned int count;
  48. };
  49. /**
  50. * Aggregate information we keep for meta data in each directory.
  51. */
  52. struct MetaCounter
  53. {
  54. /**
  55. * This is a doubly-linked list
  56. */
  57. struct MetaCounter *prev;
  58. /**
  59. * This is a doubly-linked list
  60. */
  61. struct MetaCounter *next;
  62. /**
  63. * Name of the plugin that provided that piece of metadata
  64. */
  65. const char *plugin_name;
  66. /**
  67. * MIME-type of the metadata itself
  68. */
  69. const char *data_mime_type;
  70. /**
  71. * The actual meta data.
  72. */
  73. const char *data;
  74. /**
  75. * Number of bytes in 'data'.
  76. */
  77. size_t data_size;
  78. /**
  79. * Type of the data
  80. */
  81. enum EXTRACTOR_MetaType type;
  82. /**
  83. * Format of the data
  84. */
  85. enum EXTRACTOR_MetaFormat format;
  86. /**
  87. * How many files have meta entries matching this value?
  88. * (type and format do not have to match).
  89. */
  90. unsigned int count;
  91. };
  92. /**
  93. * A structure that forms a singly-linked list that serves as a stack
  94. * for metadata-processing function.
  95. */
  96. struct TrimContext
  97. {
  98. /**
  99. * Map from the hash over the keyword to an 'struct KeywordCounter *'
  100. * counter that says how often this keyword was
  101. * encountered in the current directory.
  102. */
  103. struct GNUNET_CONTAINER_MultiHashMap *keywordcounter;
  104. /**
  105. * Map from the hash over the metadata to an 'struct MetaCounter *'
  106. * counter that says how often this metadata was
  107. * encountered in the current directory.
  108. */
  109. struct GNUNET_CONTAINER_MultiHashMap *metacounter;
  110. /**
  111. * Position we are currently manipulating.
  112. */
  113. struct GNUNET_FS_ShareTreeItem *pos;
  114. /**
  115. * Number of times an item has to be found to be moved to the parent.
  116. */
  117. unsigned int move_threshold;
  118. };
  119. /**
  120. * Add the given keyword to the keyword statistics tracker.
  121. *
  122. * @param cls the multihashmap we store the keyword counters in
  123. * @param keyword the keyword to count
  124. * @param is_mandatory ignored
  125. * @return always GNUNET_OK
  126. */
  127. static int
  128. add_to_keyword_counter (void *cls, const char *keyword, int is_mandatory)
  129. {
  130. struct GNUNET_CONTAINER_MultiHashMap *mcm = cls;
  131. struct KeywordCounter *cnt;
  132. struct GNUNET_HashCode hc;
  133. size_t klen;
  134. klen = strlen (keyword) + 1;
  135. GNUNET_CRYPTO_hash (keyword, klen - 1, &hc);
  136. cnt = GNUNET_CONTAINER_multihashmap_get (mcm, &hc);
  137. if (cnt == NULL)
  138. {
  139. cnt = GNUNET_malloc (sizeof(struct KeywordCounter) + klen);
  140. cnt->value = (const char *) &cnt[1];
  141. GNUNET_memcpy (&cnt[1], keyword, klen);
  142. GNUNET_assert (GNUNET_OK ==
  143. GNUNET_CONTAINER_multihashmap_put (mcm,
  144. &hc, cnt,
  145. GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_ONLY));
  146. }
  147. cnt->count++;
  148. return GNUNET_OK;
  149. }
  150. /**
  151. * Function called on each meta data item. Increments the
  152. * respective counter.
  153. *
  154. * @param cls the container multihashmap to update
  155. * @param plugin_name name of the plugin that produced this value;
  156. * special values can be used (i.e. '&lt;zlib&gt;' for zlib being
  157. * used in the main libextractor library and yielding
  158. * meta data).
  159. * @param type libextractor-type describing the meta data
  160. * @param format basic format information about data
  161. * @param data_mime_type mime-type of data (not of the original file);
  162. * can be NULL (if mime-type is not known)
  163. * @param data actual meta-data found
  164. * @param data_len number of bytes in data
  165. * @return 0 to continue extracting / iterating
  166. */
  167. static int
  168. add_to_meta_counter (void *cls, const char *plugin_name,
  169. enum EXTRACTOR_MetaType type, enum EXTRACTOR_MetaFormat
  170. format,
  171. const char *data_mime_type, const char *data, size_t
  172. data_len)
  173. {
  174. struct GNUNET_CONTAINER_MultiHashMap *map = cls;
  175. struct GNUNET_HashCode key;
  176. struct MetaCounter *cnt;
  177. GNUNET_CRYPTO_hash (data, data_len, &key);
  178. cnt = GNUNET_CONTAINER_multihashmap_get (map, &key);
  179. if (NULL == cnt)
  180. {
  181. cnt = GNUNET_new (struct MetaCounter);
  182. cnt->data = data;
  183. cnt->data_size = data_len;
  184. cnt->plugin_name = plugin_name;
  185. cnt->type = type;
  186. cnt->format = format;
  187. cnt->data_mime_type = data_mime_type;
  188. GNUNET_assert (GNUNET_OK ==
  189. GNUNET_CONTAINER_multihashmap_put (map,
  190. &key, cnt,
  191. GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_ONLY));
  192. }
  193. cnt->count++;
  194. return 0;
  195. }
  196. /**
  197. * Remove keywords above the threshold.
  198. *
  199. * @param cls the 'struct TrimContext' with the pos to remove the keywords from
  200. * @param keyword the keyword to check
  201. * @param is_mandatory ignored
  202. * @return always GNUNET_OK
  203. */
  204. static int
  205. remove_high_frequency_keywords (void *cls, const char *keyword, int
  206. is_mandatory)
  207. {
  208. struct TrimContext *tc = cls;
  209. struct KeywordCounter *counter;
  210. struct GNUNET_HashCode hc;
  211. size_t klen;
  212. klen = strlen (keyword) + 1;
  213. GNUNET_CRYPTO_hash (keyword, klen - 1, &hc);
  214. counter = GNUNET_CONTAINER_multihashmap_get (tc->keywordcounter, &hc);
  215. GNUNET_assert (NULL != counter);
  216. if (counter->count < tc->move_threshold)
  217. return GNUNET_OK;
  218. GNUNET_FS_uri_ksk_remove_keyword (tc->pos->ksk_uri,
  219. counter->value);
  220. return GNUNET_OK;
  221. }
  222. /**
  223. * Move "frequent" keywords over to the target ksk uri, free the
  224. * counters.
  225. *
  226. * @param cls the 'struct TrimContext'
  227. * @param key key of the entry
  228. * @param value the 'struct KeywordCounter'
  229. * @return GNUNET_YES (always)
  230. */
  231. static int
  232. migrate_and_drop_keywords (void *cls, const struct GNUNET_HashCode *key,
  233. void *value)
  234. {
  235. struct TrimContext *tc = cls;
  236. struct KeywordCounter *counter = value;
  237. if (counter->count >= tc->move_threshold)
  238. {
  239. if (NULL == tc->pos->ksk_uri)
  240. tc->pos->ksk_uri = GNUNET_FS_uri_ksk_create_from_args (1,
  241. &counter->value);
  242. else
  243. GNUNET_FS_uri_ksk_add_keyword (tc->pos->ksk_uri, counter->value,
  244. GNUNET_NO);
  245. }
  246. GNUNET_assert (GNUNET_YES ==
  247. GNUNET_CONTAINER_multihashmap_remove (tc->keywordcounter,
  248. key,
  249. counter));
  250. GNUNET_free (counter);
  251. return GNUNET_YES;
  252. }
  253. /**
  254. * Copy "frequent" metadata items over to the
  255. * target metadata container, free the counters.
  256. *
  257. * @param cls the 'struct TrimContext'
  258. * @param key key of the entry
  259. * @param value the 'struct KeywordCounter'
  260. * @return GNUNET_YES (always)
  261. */
  262. static int
  263. migrate_and_drop_metadata (void *cls, const struct GNUNET_HashCode *key,
  264. void *value)
  265. {
  266. struct TrimContext *tc = cls;
  267. struct MetaCounter *counter = value;
  268. if (counter->count >= tc->move_threshold)
  269. {
  270. if (NULL == tc->pos->meta)
  271. tc->pos->meta = GNUNET_CONTAINER_meta_data_create ();
  272. GNUNET_CONTAINER_meta_data_insert (tc->pos->meta,
  273. counter->plugin_name,
  274. counter->type,
  275. counter->format,
  276. counter->data_mime_type, counter->data,
  277. counter->data_size);
  278. }
  279. GNUNET_assert (GNUNET_YES ==
  280. GNUNET_CONTAINER_multihashmap_remove (tc->metacounter,
  281. key,
  282. counter));
  283. GNUNET_free (counter);
  284. return GNUNET_YES;
  285. }
  286. /**
  287. * Process a share item tree, moving frequent keywords up and
  288. * copying frequent metadata up.
  289. *
  290. * @param tc trim context with hash maps to use
  291. * @param tree tree to trim
  292. */
  293. static void
  294. share_tree_trim (struct TrimContext *tc,
  295. struct GNUNET_FS_ShareTreeItem *tree)
  296. {
  297. struct GNUNET_FS_ShareTreeItem *pos;
  298. unsigned int num_children;
  299. /* first, trim all children */
  300. num_children = 0;
  301. for (pos = tree->children_head; NULL != pos; pos = pos->next)
  302. {
  303. share_tree_trim (tc, pos);
  304. num_children++;
  305. }
  306. /* consider adding filename to directory meta data */
  307. if (tree->is_directory == GNUNET_YES)
  308. {
  309. const char *user = getenv ("USER");
  310. if ((user == NULL) ||
  311. (0 != strncasecmp (user, tree->short_filename, strlen (user))))
  312. {
  313. /* only use filename if it doesn't match $USER */
  314. if (NULL == tree->meta)
  315. tree->meta = GNUNET_CONTAINER_meta_data_create ();
  316. GNUNET_CONTAINER_meta_data_insert (tree->meta, "<libgnunetfs>",
  317. EXTRACTOR_METATYPE_GNUNET_ORIGINAL_FILENAME,
  318. EXTRACTOR_METAFORMAT_UTF8,
  319. "text/plain", tree->short_filename,
  320. strlen (tree->short_filename) + 1);
  321. }
  322. }
  323. if (1 >= num_children)
  324. return; /* nothing to trim */
  325. /* now, count keywords and meta data in children */
  326. for (pos = tree->children_head; NULL != pos; pos = pos->next)
  327. {
  328. if (NULL != pos->meta)
  329. GNUNET_CONTAINER_meta_data_iterate (pos->meta, &add_to_meta_counter,
  330. tc->metacounter);
  331. if (NULL != pos->ksk_uri)
  332. GNUNET_FS_uri_ksk_get_keywords (pos->ksk_uri, &add_to_keyword_counter,
  333. tc->keywordcounter);
  334. }
  335. /* calculate threshold for moving keywords / meta data */
  336. tc->move_threshold = 1 + (num_children / 2);
  337. /* remove high-frequency keywords from children */
  338. for (pos = tree->children_head; NULL != pos; pos = pos->next)
  339. {
  340. tc->pos = pos;
  341. if (NULL != pos->ksk_uri)
  342. {
  343. struct GNUNET_FS_Uri *ksk_uri_copy = GNUNET_FS_uri_dup (pos->ksk_uri);
  344. GNUNET_FS_uri_ksk_get_keywords (ksk_uri_copy,
  345. &remove_high_frequency_keywords, tc);
  346. GNUNET_FS_uri_destroy (ksk_uri_copy);
  347. }
  348. }
  349. /* add high-frequency meta data and keywords to parent */
  350. tc->pos = tree;
  351. GNUNET_CONTAINER_multihashmap_iterate (tc->keywordcounter,
  352. &migrate_and_drop_keywords,
  353. tc);
  354. GNUNET_CONTAINER_multihashmap_iterate (tc->metacounter,
  355. &migrate_and_drop_metadata,
  356. tc);
  357. }
  358. /**
  359. * Process a share item tree, moving frequent keywords up and
  360. * copying frequent metadata up.
  361. *
  362. * @param toplevel toplevel directory in the tree, returned by the scanner
  363. */
  364. void
  365. GNUNET_FS_share_tree_trim (struct GNUNET_FS_ShareTreeItem *toplevel)
  366. {
  367. struct TrimContext tc;
  368. if (toplevel == NULL)
  369. return;
  370. tc.keywordcounter = GNUNET_CONTAINER_multihashmap_create (1024, GNUNET_NO);
  371. tc.metacounter = GNUNET_CONTAINER_multihashmap_create (1024, GNUNET_NO);
  372. share_tree_trim (&tc, toplevel);
  373. GNUNET_CONTAINER_multihashmap_destroy (tc.keywordcounter);
  374. GNUNET_CONTAINER_multihashmap_destroy (tc.metacounter);
  375. }
  376. /**
  377. * Release memory of a share item tree.
  378. *
  379. * @param toplevel toplevel of the tree to be freed
  380. */
  381. void
  382. GNUNET_FS_share_tree_free (struct GNUNET_FS_ShareTreeItem *toplevel)
  383. {
  384. struct GNUNET_FS_ShareTreeItem *pos;
  385. while (NULL != (pos = toplevel->children_head))
  386. GNUNET_FS_share_tree_free (pos);
  387. if (NULL != toplevel->parent)
  388. GNUNET_CONTAINER_DLL_remove (toplevel->parent->children_head,
  389. toplevel->parent->children_tail,
  390. toplevel);
  391. if (NULL != toplevel->meta)
  392. GNUNET_CONTAINER_meta_data_destroy (toplevel->meta);
  393. if (NULL != toplevel->ksk_uri)
  394. GNUNET_FS_uri_destroy (toplevel->ksk_uri);
  395. GNUNET_free_non_null (toplevel->filename);
  396. GNUNET_free_non_null (toplevel->short_filename);
  397. GNUNET_free (toplevel);
  398. }
  399. /* end fs_sharetree.c */