080-06-fib_trie-Optimize-fib_table_lookup-to-avoid-wasting-.patch 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. From: Alexander Duyck <alexander.h.duyck@redhat.com>
  2. Date: Wed, 31 Dec 2014 10:55:54 -0800
  3. Subject: [PATCH] fib_trie: Optimize fib_table_lookup to avoid wasting
  4. time on loops/variables
  5. This patch is meant to reduce the complexity of fib_table_lookup by reducing
  6. the number of variables to the bare minimum while still keeping the same, if
  7. not improved, functionality compared with the original.
  8. Most of this change was motivated by the desire to rid the function of
  9. chopped_off and current_prefix_length, as they added very little to
  10. the function since they only applied when computing the cindex. I was able
  11. to replace them mostly with just a check for the prefix match. As long as
  12. the prefix of the key and that of the node being tested are the same, we know
  13. we can search the tnode fully rather than just testing cindex 0.
  14. The second portion of the change ended up being a massive reordering.
  15. Originally the calls to check_leaf were up near the start of the loop, and
  16. the backtracking and descending into lower levels of tnodes came later. This
  17. didn't make much sense, as the structure of the tree means the leaves are
  18. always the last thing to be tested. As such, I reordered things so that we
  19. instead have a loop that will delve into the tree and only exit when we
  20. have either found a leaf or we have exhausted the tree. The advantage of
  21. rearranging things like this is that we can fully inline check_leaf, since
  22. there is now only one reference to it in the function.
  23. Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
  24. Signed-off-by: David S. Miller <davem@davemloft.net>
  25. ---
  26. --- a/net/ipv4/fib_trie.c
  27. +++ b/net/ipv4/fib_trie.c
  28. @@ -90,6 +90,9 @@ typedef unsigned int t_key;
  29. #define IS_TNODE(n) ((n)->bits)
  30. #define IS_LEAF(n) (!(n)->bits)
  31. +#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
  32. +#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
  33. +
  34. struct tnode {
  35. t_key key;
  36. unsigned char bits; /* 2log(KEYLENGTH) bits needed */
  37. @@ -1281,7 +1284,7 @@ static int check_leaf(struct fib_table *
  38. continue;
  39. fib_alias_accessed(fa);
  40. err = fib_props[fa->fa_type].error;
  41. - if (err) {
  42. + if (unlikely(err < 0)) {
  43. #ifdef CONFIG_IP_FIB_TRIE_STATS
  44. this_cpu_inc(t->stats->semantic_match_passed);
  45. #endif
  46. @@ -1303,7 +1306,7 @@ static int check_leaf(struct fib_table *
  47. res->prefixlen = li->plen;
  48. res->nh_sel = nhsel;
  49. res->type = fa->fa_type;
  50. - res->scope = fa->fa_info->fib_scope;
  51. + res->scope = fi->fib_scope;
  52. res->fi = fi;
  53. res->table = tb;
  54. res->fa_head = &li->falh;
  55. @@ -1321,23 +1324,24 @@ static int check_leaf(struct fib_table *
  56. return 1;
  57. }
  58. +static inline t_key prefix_mismatch(t_key key, struct tnode *n)
  59. +{
  60. + t_key prefix = n->key;
  61. +
  62. + return (key ^ prefix) & (prefix | -prefix);
  63. +}
  64. +
  65. int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
  66. struct fib_result *res, int fib_flags)
  67. {
  68. - struct trie *t = (struct trie *) tb->tb_data;
  69. + struct trie *t = (struct trie *)tb->tb_data;
  70. #ifdef CONFIG_IP_FIB_TRIE_STATS
  71. struct trie_use_stats __percpu *stats = t->stats;
  72. #endif
  73. - int ret;
  74. - struct tnode *n;
  75. - struct tnode *pn;
  76. - unsigned int pos, bits;
  77. - t_key key = ntohl(flp->daddr);
  78. - unsigned int chopped_off;
  79. - t_key cindex = 0;
  80. - unsigned int current_prefix_length = KEYLENGTH;
  81. - struct tnode *cn;
  82. - t_key pref_mismatch;
  83. + const t_key key = ntohl(flp->daddr);
  84. + struct tnode *n, *pn;
  85. + t_key cindex;
  86. + int ret = 1;
  87. rcu_read_lock();
  88. @@ -1349,170 +1353,102 @@ int fib_table_lookup(struct fib_table *t
  89. this_cpu_inc(stats->gets);
  90. #endif
  91. - /* Just a leaf? */
  92. - if (IS_LEAF(n)) {
  93. - ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
  94. - goto found;
  95. - }
  96. -
  97. pn = n;
  98. - chopped_off = 0;
  99. -
  100. - while (pn) {
  101. - pos = pn->pos;
  102. - bits = pn->bits;
  103. + cindex = 0;
  104. - if (!chopped_off)
  105. - cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
  106. - pos, bits);
  107. -
  108. - n = tnode_get_child_rcu(pn, cindex);
  109. -
  110. - if (n == NULL) {
  111. -#ifdef CONFIG_IP_FIB_TRIE_STATS
  112. - this_cpu_inc(stats->null_node_hit);
  113. -#endif
  114. - goto backtrace;
  115. - }
  116. + /* Step 1: Travel to the longest prefix match in the trie */
  117. + for (;;) {
  118. + unsigned long index = get_index(key, n);
  119. +
  120. + /* This bit of code is a bit tricky but it combines multiple
  121. + * checks into a single check. The prefix consists of the
  122. + * prefix plus zeros for the "bits" in the prefix. The index
  123. + * is the difference between the key and this value. From
  124. + * this we can actually derive several pieces of data.
  125. + * if !(index >> bits)
  126. + * we know the value is child index
  127. + * else
  128. + * we have a mismatch in skip bits and failed
  129. + */
  130. + if (index >> n->bits)
  131. + break;
  132. - if (IS_LEAF(n)) {
  133. - ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
  134. - if (ret > 0)
  135. - goto backtrace;
  136. + /* we have found a leaf. Prefixes have already been compared */
  137. + if (IS_LEAF(n))
  138. goto found;
  139. - }
  140. -
  141. - cn = n;
  142. - /*
  143. - * It's a tnode, and we can do some extra checks here if we
  144. - * like, to avoid descending into a dead-end branch.
  145. - * This tnode is in the parent's child array at index
  146. - * key[p_pos..p_pos+p_bits] but potentially with some bits
  147. - * chopped off, so in reality the index may be just a
  148. - * subprefix, padded with zero at the end.
  149. - * We can also take a look at any skipped bits in this
  150. - * tnode - everything up to p_pos is supposed to be ok,
  151. - * and the non-chopped bits of the index (se previous
  152. - * paragraph) are also guaranteed ok, but the rest is
  153. - * considered unknown.
  154. - *
  155. - * The skipped bits are key[pos+bits..cn->pos].
  156. - */
  157. -
  158. - /* If current_prefix_length < pos+bits, we are already doing
  159. - * actual prefix matching, which means everything from
  160. - * pos+(bits-chopped_off) onward must be zero along some
  161. - * branch of this subtree - otherwise there is *no* valid
  162. - * prefix present. Here we can only check the skipped
  163. - * bits. Remember, since we have already indexed into the
  164. - * parent's child array, we know that the bits we chopped of
  165. - * *are* zero.
  166. + /* only record pn and cindex if we are going to be chopping
  167. + * bits later. Otherwise we are just wasting cycles.
  168. */
  169. -
  170. - /* NOTA BENE: Checking only skipped bits
  171. - for the new node here */
  172. -
  173. - if (current_prefix_length < pos+bits) {
  174. - if (tkey_extract_bits(cn->key, current_prefix_length,
  175. - cn->pos - current_prefix_length)
  176. - || !(cn->child[0]))
  177. - goto backtrace;
  178. + if (index) {
  179. + pn = n;
  180. + cindex = index;
  181. }
  182. - /*
  183. - * If chopped_off=0, the index is fully validated and we
  184. - * only need to look at the skipped bits for this, the new,
  185. - * tnode. What we actually want to do is to find out if
  186. - * these skipped bits match our key perfectly, or if we will
  187. - * have to count on finding a matching prefix further down,
  188. - * because if we do, we would like to have some way of
  189. - * verifying the existence of such a prefix at this point.
  190. - */
  191. -
  192. - /* The only thing we can do at this point is to verify that
  193. - * any such matching prefix can indeed be a prefix to our
  194. - * key, and if the bits in the node we are inspecting that
  195. - * do not match our key are not ZERO, this cannot be true.
  196. - * Thus, find out where there is a mismatch (before cn->pos)
  197. - * and verify that all the mismatching bits are zero in the
  198. - * new tnode's key.
  199. - */
  200. + n = rcu_dereference(n->child[index]);
  201. + if (unlikely(!n))
  202. + goto backtrace;
  203. + }
  204. - /*
  205. - * Note: We aren't very concerned about the piece of
  206. - * the key that precede pn->pos+pn->bits, since these
  207. - * have already been checked. The bits after cn->pos
  208. - * aren't checked since these are by definition
  209. - * "unknown" at this point. Thus, what we want to see
  210. - * is if we are about to enter the "prefix matching"
  211. - * state, and in that case verify that the skipped
  212. - * bits that will prevail throughout this subtree are
  213. - * zero, as they have to be if we are to find a
  214. - * matching prefix.
  215. + /* Step 2: Sort out leaves and begin backtracing for longest prefix */
  216. + for (;;) {
  217. + /* record the pointer where our next node pointer is stored */
  218. + struct tnode __rcu **cptr = n->child;
  219. +
  220. + /* This test verifies that none of the bits that differ
  221. + * between the key and the prefix exist in the region of
  222. + * the lsb and higher in the prefix.
  223. */
  224. + if (unlikely(prefix_mismatch(key, n)))
  225. + goto backtrace;
  226. - pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
  227. + /* exit out and process leaf */
  228. + if (unlikely(IS_LEAF(n)))
  229. + break;
  230. - /*
  231. - * In short: If skipped bits in this node do not match
  232. - * the search key, enter the "prefix matching"
  233. - * state.directly.
  234. + /* Don't bother recording parent info. Since we are in
  235. + * prefix match mode we will have to come back to wherever
  236. + * we started this traversal anyway
  237. */
  238. - if (pref_mismatch) {
  239. - /* fls(x) = __fls(x) + 1 */
  240. - int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
  241. -
  242. - if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
  243. - goto backtrace;
  244. -
  245. - if (current_prefix_length >= cn->pos)
  246. - current_prefix_length = mp;
  247. - }
  248. -
  249. - pn = n; /* Descend */
  250. - chopped_off = 0;
  251. - continue;
  252. + while ((n = rcu_dereference(*cptr)) == NULL) {
  253. backtrace:
  254. - chopped_off++;
  255. -
  256. - /* As zero don't change the child key (cindex) */
  257. - while ((chopped_off <= pn->bits)
  258. - && !(cindex & (1<<(chopped_off-1))))
  259. - chopped_off++;
  260. -
  261. - /* Decrease current_... with bits chopped off */
  262. - if (current_prefix_length > pn->pos + pn->bits - chopped_off)
  263. - current_prefix_length = pn->pos + pn->bits
  264. - - chopped_off;
  265. -
  266. - /*
  267. - * Either we do the actual chop off according or if we have
  268. - * chopped off all bits in this tnode walk up to our parent.
  269. - */
  270. -
  271. - if (chopped_off <= pn->bits) {
  272. - cindex &= ~(1 << (chopped_off-1));
  273. - } else {
  274. - struct tnode *parent = node_parent_rcu(pn);
  275. - if (!parent)
  276. - goto failed;
  277. -
  278. - /* Get Child's index */
  279. - cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
  280. - pn = parent;
  281. - chopped_off = 0;
  282. -
  283. #ifdef CONFIG_IP_FIB_TRIE_STATS
  284. - this_cpu_inc(stats->backtrack);
  285. + if (!n)
  286. + this_cpu_inc(stats->null_node_hit);
  287. #endif
  288. - goto backtrace;
  289. + /* If we are at cindex 0 there are no more bits for
  290. + * us to strip at this level so we must ascend back
  291. + * up one level to see if there are any more bits to
  292. + * be stripped there.
  293. + */
  294. + while (!cindex) {
  295. + t_key pkey = pn->key;
  296. +
  297. + pn = node_parent_rcu(pn);
  298. + if (unlikely(!pn))
  299. + goto failed;
  300. +#ifdef CONFIG_IP_FIB_TRIE_STATS
  301. + this_cpu_inc(stats->backtrack);
  302. +#endif
  303. + /* Get Child's index */
  304. + cindex = get_index(pkey, pn);
  305. + }
  306. +
  307. + /* strip the least significant bit from the cindex */
  308. + cindex &= cindex - 1;
  309. +
  310. + /* grab pointer for next child node */
  311. + cptr = &pn->child[cindex];
  312. }
  313. }
  314. -failed:
  315. - ret = 1;
  316. +
  317. found:
  318. + /* Step 3: Process the leaf, if that fails fall back to backtracing */
  319. + ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
  320. + if (unlikely(ret > 0))
  321. + goto backtrace;
  322. +failed:
  323. rcu_read_unlock();
  324. return ret;
  325. }