bt_split.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826
  1. /*
  2. * CDE - Common Desktop Environment
  3. *
  4. * Copyright (c) 1993-2012, The Open Group. All rights reserved.
  5. *
  6. * These libraries and programs are free software; you can
  7. * redistribute them and/or modify them under the terms of the GNU
  8. * Lesser General Public License as published by the Free Software
  9. * Foundation; either version 2 of the License, or (at your option)
  10. * any later version.
  11. *
  12. * These libraries and programs are distributed in the hope that
  13. * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14. * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15. * PURPOSE. See the GNU Lesser General Public License for more
  16. * details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with these libraries and programs; if not, write
  20. * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21. * Floor, Boston, MA 02110-1301 USA
  22. */
  23. /* $XConsortium: bt_split.c /main/3 1996/06/11 17:13:10 cde-hal $ */
  24. /*-
  25. * Copyright (c) 1990, 1993
  26. * The Regents of the University of California. All rights reserved.
  27. *
  28. * This code is derived from software contributed to Berkeley by
  29. * Mike Olson.
  30. *
  31. * Redistribution and use in source and binary forms, with or without
  32. * modification, are permitted provided that the following conditions
  33. * are met:
  34. * 1. Redistributions of source code must retain the above copyright
  35. * notice, this list of conditions and the following disclaimer.
  36. * 2. Redistributions in binary form must reproduce the above copyright
  37. * notice, this list of conditions and the following disclaimer in the
  38. * documentation and/or other materials provided with the distribution.
  39. * 3. All advertising materials mentioning features or use of this software
  40. * must display the following acknowledgement:
  41. * This product includes software developed by the University of
  42. * California, Berkeley and its contributors.
  43. * 4. Neither the name of the University nor the names of its contributors
  44. * may be used to endorse or promote products derived from this software
  45. * without specific prior written permission.
  46. *
  47. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  48. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  49. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  50. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  51. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  52. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  53. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  54. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  55. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  56. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  57. * SUCH DAMAGE.
  58. */
  59. #if defined(LIBC_SCCS) && !defined(lint)
  60. static char sccsid[] = "@(#)bt_split.c 8.1 (Berkeley) 6/4/93";
  61. #endif /* LIBC_SCCS and not lint */
  62. #include <sys/types.h>
  63. #define __DBINTERFACE_PRIVATE
  64. #include <limits.h>
  65. #include <stdio.h>
  66. #include <stdlib.h>
  67. #include <string.h>
  68. #include <db.h>
  69. #include "btree.h"
  70. static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
  71. static PAGE *bt_page
  72. __P((BTREE *, PAGE *, PAGE **, PAGE **, u_int *, size_t));
  73. static int bt_preserve __P((BTREE *, pgno_t));
  74. static PAGE *bt_psplit
  75. __P((BTREE *, PAGE *, PAGE *, PAGE *, u_int *, size_t));
  76. static PAGE *bt_root
  77. __P((BTREE *, PAGE *, PAGE **, PAGE **, u_int *, size_t));
  78. static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
  79. static recno_t rec_total __P((PAGE *));
  80. #ifdef STATISTICS
  81. u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
  82. #endif
  83. /*
  84. * __BT_SPLIT -- Split the tree.
  85. *
  86. * Parameters:
  87. * t: tree
  88. * sp: page to split
  89. * key: key to insert
  90. * data: data to insert
  91. * flags: BIGKEY/BIGDATA flags
  92. * ilen: insert length
  93. * skip: index to leave open
  94. *
  95. * Returns:
  96. * RET_ERROR, RET_SUCCESS
  97. */
  98. int
  99. __bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, u_long flags, size_t ilen, u_int skip)
  100. {
  101. BINTERNAL *bi = NULL;
  102. BLEAF *bl = NULL;
  103. BLEAF *tbl;
  104. DBT a, b;
  105. EPGNO *parent;
  106. PAGE *h, *l, *r, *lchild, *rchild;
  107. indx_t nxtindex;
  108. size_t n, nbytes;
  109. size_t nksize = 0;
  110. int parentsplit;
  111. char *dest;
  112. /*
  113. * Split the page into two pages, l and r. The split routines return
  114. * a pointer to the page into which the key should be inserted and with
  115. * skip set to the offset which should be used. Additionally, l and r
  116. * are pinned.
  117. */
  118. h = sp->pgno == P_ROOT ?
  119. bt_root(t, sp, &l, &r, &skip, ilen) :
  120. bt_page(t, sp, &l, &r, &skip, ilen);
  121. if (h == NULL)
  122. return (RET_ERROR);
  123. /*
  124. * Insert the new key/data pair into the leaf page. (Key inserts
  125. * always cause a leaf page to split first.)
  126. */
  127. h->linp[skip] = h->upper -= ilen;
  128. dest = (char *)h + h->upper;
  129. if (ISSET(t, R_RECNO))
  130. WR_RLEAF(dest, data, flags)
  131. else
  132. WR_BLEAF(dest, key, data, flags)
  133. /* If the root page was split, make it look right. */
  134. if (sp->pgno == P_ROOT &&
  135. (ISSET(t, R_RECNO) ?
  136. bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
  137. goto err2;
  138. /*
  139. * Now we walk the parent page stack -- a LIFO stack of the pages that
  140. * were traversed when we searched for the page that split. Each stack
  141. * entry is a page number and a page index offset. The offset is for
  142. * the page traversed on the search. We've just split a page, so we
  143. * have to insert a new key into the parent page.
  144. *
  145. * If the insert into the parent page causes it to split, may have to
  146. * continue splitting all the way up the tree. We stop if the root
  147. * splits or the page inserted into didn't have to split to hold the
  148. * new key. Some algorithms replace the key for the old page as well
  149. * as the new page. We don't, as there's no reason to believe that the
  150. * first key on the old page is any better than the key we have, and,
  151. * in the case of a key being placed at index 0 causing the split, the
  152. * key is unavailable.
  153. *
  154. * There are a maximum of 5 pages pinned at any time. We keep the left
  155. * and right pages pinned while working on the parent. The 5 are the
  156. * two children, left parent and right parent (when the parent splits)
  157. * and the root page or the overflow key page when calling bt_preserve.
  158. * This code must make sure that all pins are released other than the
  159. * root page or overflow page which is unlocked elsewhere.
  160. */
  161. while ((parent = BT_POP(t)) != NULL) {
  162. lchild = l;
  163. rchild = r;
  164. /* Get the parent page. */
  165. if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
  166. goto err2;
  167. /*
  168. * The new key goes ONE AFTER the index, because the split
  169. * was to the right.
  170. */
  171. skip = parent->index + 1;
  172. /*
  173. * Calculate the space needed on the parent page.
  174. *
  175. * Prefix trees: space hack when inserting into BINTERNAL
  176. * pages. Retain only what's needed to distinguish between
  177. * the new entry and the LAST entry on the page to its left.
  178. * If the keys compare equal, retain the entire key. Note,
  179. * we don't touch overflow keys, and the entire key must be
  180. * retained for the next-to-left most key on the leftmost
  181. * page of each level, or the search will fail. Applicable
  182. * ONLY to internal pages that have leaf pages as children.
  183. * Further reduction of the key between pairs of internal
  184. * pages loses too much information.
  185. */
  186. switch (rchild->flags & P_TYPE) {
  187. case P_BINTERNAL:
  188. bi = GETBINTERNAL(rchild, 0);
  189. nbytes = NBINTERNAL(bi->ksize);
  190. break;
  191. case P_BLEAF:
  192. bl = GETBLEAF(rchild, 0);
  193. nbytes = NBINTERNAL(bl->ksize);
  194. if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
  195. (h->prevpg != P_INVALID || skip > 1)) {
  196. tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
  197. a.size = tbl->ksize;
  198. a.data = tbl->bytes;
  199. b.size = bl->ksize;
  200. b.data = bl->bytes;
  201. nksize = t->bt_pfx(&a, &b);
  202. n = NBINTERNAL(nksize);
  203. if (n < nbytes) {
  204. #ifdef STATISTICS
  205. bt_pfxsaved += nbytes - n;
  206. #endif
  207. nbytes = n;
  208. } else
  209. nksize = 0;
  210. } else
  211. nksize = 0;
  212. break;
  213. case P_RINTERNAL:
  214. case P_RLEAF:
  215. nbytes = NRINTERNAL;
  216. break;
  217. default:
  218. abort();
  219. }
  220. /* Split the parent page if necessary or shift the indices. */
  221. if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
  222. sp = h;
  223. h = h->pgno == P_ROOT ?
  224. bt_root(t, h, &l, &r, &skip, nbytes) :
  225. bt_page(t, h, &l, &r, &skip, nbytes);
  226. if (h == NULL)
  227. goto err1;
  228. parentsplit = 1;
  229. } else {
  230. if (skip < (nxtindex = NEXTINDEX(h)))
  231. memmove(h->linp + skip + 1, h->linp + skip,
  232. (nxtindex - skip) * sizeof(indx_t));
  233. h->lower += sizeof(indx_t);
  234. parentsplit = 0;
  235. }
  236. /* Insert the key into the parent page. */
  237. switch(rchild->flags & P_TYPE) {
  238. case P_BINTERNAL:
  239. h->linp[skip] = h->upper -= nbytes;
  240. dest = (char *)h + h->linp[skip];
  241. memmove(dest, bi, nbytes);
  242. ((BINTERNAL *)dest)->pgno = rchild->pgno;
  243. break;
  244. case P_BLEAF:
  245. h->linp[skip] = h->upper -= nbytes;
  246. dest = (char *)h + h->linp[skip];
  247. WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
  248. rchild->pgno, bl->flags & P_BIGKEY);
  249. memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
  250. if (bl->flags & P_BIGKEY &&
  251. bt_preserve(t, *(char *)bl->bytes) == RET_ERROR)
  252. goto err1;
  253. break;
  254. case P_RINTERNAL:
  255. /*
  256. * Update the left page count. If split
  257. * added at index 0, fix the correct page.
  258. */
  259. if (skip > 0)
  260. dest = (char *)h + h->linp[skip - 1];
  261. else
  262. dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
  263. ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
  264. ((RINTERNAL *)dest)->pgno = lchild->pgno;
  265. /* Update the right page count. */
  266. h->linp[skip] = h->upper -= nbytes;
  267. dest = (char *)h + h->linp[skip];
  268. ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
  269. ((RINTERNAL *)dest)->pgno = rchild->pgno;
  270. break;
  271. case P_RLEAF:
  272. /*
  273. * Update the left page count. If split
  274. * added at index 0, fix the correct page.
  275. */
  276. if (skip > 0)
  277. dest = (char *)h + h->linp[skip - 1];
  278. else
  279. dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
  280. ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
  281. ((RINTERNAL *)dest)->pgno = lchild->pgno;
  282. /* Update the right page count. */
  283. h->linp[skip] = h->upper -= nbytes;
  284. dest = (char *)h + h->linp[skip];
  285. ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
  286. ((RINTERNAL *)dest)->pgno = rchild->pgno;
  287. break;
  288. default:
  289. abort();
  290. }
  291. /* Unpin the held pages. */
  292. if (!parentsplit) {
  293. mpool_put(t->bt_mp, h, MPOOL_DIRTY);
  294. break;
  295. }
  296. /* If the root page was split, make it look right. */
  297. if (sp->pgno == P_ROOT &&
  298. (ISSET(t, R_RECNO) ?
  299. bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
  300. goto err1;
  301. mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
  302. mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
  303. }
  304. /* Unpin the held pages. */
  305. mpool_put(t->bt_mp, l, MPOOL_DIRTY);
  306. mpool_put(t->bt_mp, r, MPOOL_DIRTY);
  307. /* Clear any pages left on the stack. */
  308. return (RET_SUCCESS);
  309. /*
  310. * If something fails in the above loop we were already walking back
  311. * up the tree and the tree is now inconsistent. Nothing much we can
  312. * do about it but release any memory we're holding.
  313. */
  314. err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
  315. mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
  316. err2: mpool_put(t->bt_mp, l, 0);
  317. mpool_put(t->bt_mp, r, 0);
  318. __dbpanic(t->bt_dbp);
  319. return (RET_ERROR);
  320. }
  321. /*
  322. * BT_PAGE -- Split a non-root page of a btree.
  323. *
  324. * Parameters:
  325. * t: tree
  326. * h: root page
  327. * lp: pointer to left page pointer
  328. * rp: pointer to right page pointer
  329. * skip: pointer to index to leave open
  330. * ilen: insert length
  331. *
  332. * Returns:
  333. * Pointer to page in which to insert or NULL on error.
  334. */
  335. static PAGE *
  336. bt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, u_int *skip, size_t ilen)
  337. {
  338. PAGE *l, *r, *tp;
  339. pgno_t npg;
  340. #ifdef STATISTICS
  341. ++bt_split;
  342. #endif
  343. /* Put the new right page for the split into place. */
  344. if ((r = __bt_new(t, &npg)) == NULL)
  345. return (NULL);
  346. r->pgno = npg;
  347. r->lower = BTDATAOFF;
  348. r->upper = t->bt_psize;
  349. r->nextpg = h->nextpg;
  350. r->prevpg = h->pgno;
  351. r->flags = h->flags & P_TYPE;
  352. /*
  353. * If we're splitting the last page on a level because we're appending
  354. * a key to it (skip is NEXTINDEX()), it's likely that the data is
  355. * sorted. Adding an empty page on the side of the level is less work
  356. * and can push the fill factor much higher than normal. If we're
  357. * wrong it's no big deal, we'll just do the split the right way next
  358. * time. It may look like it's equally easy to do a similar hack for
  359. * reverse sorted data, that is, split the tree left, but it's not.
  360. * Don't even try.
  361. */
  362. if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
  363. #ifdef STATISTICS
  364. ++bt_sortsplit;
  365. #endif
  366. h->nextpg = r->pgno;
  367. r->lower = BTDATAOFF + sizeof(indx_t);
  368. *skip = 0;
  369. *lp = h;
  370. *rp = r;
  371. return (r);
  372. }
  373. /* Put the new left page for the split into place. */
  374. if ((l = malloc(t->bt_psize)) == NULL) {
  375. mpool_put(t->bt_mp, r, 0);
  376. return (NULL);
  377. }
  378. l->pgno = h->pgno;
  379. l->nextpg = r->pgno;
  380. l->prevpg = h->prevpg;
  381. l->lower = BTDATAOFF;
  382. l->upper = t->bt_psize;
  383. l->flags = h->flags & P_TYPE;
  384. /* Fix up the previous pointer of the page after the split page. */
  385. if (h->nextpg != P_INVALID) {
  386. if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
  387. free(l);
  388. /* XXX mpool_free(t->bt_mp, r->pgno); */
  389. return (NULL);
  390. }
  391. tp->prevpg = r->pgno;
  392. mpool_put(t->bt_mp, tp, 0);
  393. }
  394. /*
  395. * Split right. The key/data pairs aren't sorted in the btree page so
  396. * it's simpler to copy the data from the split page onto two new pages
  397. * instead of copying half the data to the right page and compacting
  398. * the left page in place. Since the left page can't change, we have
  399. * to swap the original and the allocated left page after the split.
  400. */
  401. tp = bt_psplit(t, h, l, r, skip, ilen);
  402. /* Move the new left page onto the old left page. */
  403. memmove(h, l, t->bt_psize);
  404. if (tp == l)
  405. tp = h;
  406. free(l);
  407. *lp = h;
  408. *rp = r;
  409. return (tp);
  410. }
  411. /*
  412. * BT_ROOT -- Split the root page of a btree.
  413. *
  414. * Parameters:
  415. * t: tree
  416. * h: root page
  417. * lp: pointer to left page pointer
  418. * rp: pointer to right page pointer
  419. * skip: pointer to index to leave open
  420. * ilen: insert length
  421. *
  422. * Returns:
  423. * Pointer to page in which to insert or NULL on error.
  424. */
  425. static PAGE *
  426. bt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, u_int *skip, size_t ilen)
  427. {
  428. PAGE *l, *r, *tp;
  429. pgno_t lnpg, rnpg;
  430. #ifdef STATISTICS
  431. ++bt_split;
  432. ++bt_rootsplit;
  433. #endif
  434. /* Put the new left and right pages for the split into place. */
  435. if ((l = __bt_new(t, &lnpg)) == NULL ||
  436. (r = __bt_new(t, &rnpg)) == NULL)
  437. return (NULL);
  438. l->pgno = lnpg;
  439. r->pgno = rnpg;
  440. l->nextpg = r->pgno;
  441. r->prevpg = l->pgno;
  442. l->prevpg = r->nextpg = P_INVALID;
  443. l->lower = r->lower = BTDATAOFF;
  444. l->upper = r->upper = t->bt_psize;
  445. l->flags = r->flags = h->flags & P_TYPE;
  446. /* Split the root page. */
  447. tp = bt_psplit(t, h, l, r, skip, ilen);
  448. *lp = l;
  449. *rp = r;
  450. return (tp);
  451. }
  452. /*
  453. * BT_RROOT -- Fix up the recno root page after it has been split.
  454. *
  455. * Parameters:
  456. * t: tree
  457. * h: root page
  458. * l: left page
  459. * r: right page
  460. *
  461. * Returns:
  462. * RET_ERROR, RET_SUCCESS
  463. */
  464. static int
  465. bt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
  466. {
  467. char *dest;
  468. /* Insert the left and right keys, set the header information. */
  469. h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
  470. dest = (char *)h + h->upper;
  471. WR_RINTERNAL(dest,
  472. l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
  473. h->linp[1] = h->upper -= NRINTERNAL;
  474. dest = (char *)h + h->upper;
  475. WR_RINTERNAL(dest,
  476. r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
  477. h->lower = BTDATAOFF + 2 * sizeof(indx_t);
  478. /* Unpin the root page, set to recno internal page. */
  479. h->flags &= ~P_TYPE;
  480. h->flags |= P_RINTERNAL;
  481. mpool_put(t->bt_mp, h, MPOOL_DIRTY);
  482. return (RET_SUCCESS);
  483. }
  484. /*
  485. * BT_BROOT -- Fix up the btree root page after it has been split.
  486. *
  487. * Parameters:
  488. * t: tree
  489. * h: root page
  490. * l: left page
  491. * r: right page
  492. *
  493. * Returns:
  494. * RET_ERROR, RET_SUCCESS
  495. */
  496. static int
  497. bt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
  498. {
  499. BINTERNAL *bi;
  500. BLEAF *bl;
  501. size_t nbytes;
  502. char *dest;
  503. /*
  504. * If the root page was a leaf page, change it into an internal page.
  505. * We copy the key we split on (but not the key's data, in the case of
  506. * a leaf page) to the new root page.
  507. *
  508. * The btree comparison code guarantees that the left-most key on any
  509. * level of the tree is never used, so it doesn't need to be filled in.
  510. */
  511. nbytes = NBINTERNAL(0);
  512. h->linp[0] = h->upper = t->bt_psize - nbytes;
  513. dest = (char *)h + h->upper;
  514. WR_BINTERNAL(dest, 0, l->pgno, 0);
  515. switch(h->flags & P_TYPE) {
  516. case P_BLEAF:
  517. bl = GETBLEAF(r, 0);
  518. nbytes = NBINTERNAL(bl->ksize);
  519. h->linp[1] = h->upper -= nbytes;
  520. dest = (char *)h + h->upper;
  521. WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
  522. memmove(dest, bl->bytes, bl->ksize);
  523. /*
  524. * If the key is on an overflow page, mark the overflow chain
  525. * so it isn't deleted when the leaf copy of the key is deleted.
  526. */
  527. if (bl->flags & P_BIGKEY &&
  528. bt_preserve(t, *(char *)bl->bytes) == RET_ERROR)
  529. return (RET_ERROR);
  530. break;
  531. case P_BINTERNAL:
  532. bi = GETBINTERNAL(r, 0);
  533. nbytes = NBINTERNAL(bi->ksize);
  534. h->linp[1] = h->upper -= nbytes;
  535. dest = (char *)h + h->upper;
  536. memmove(dest, bi, nbytes);
  537. ((BINTERNAL *)dest)->pgno = r->pgno;
  538. break;
  539. default:
  540. abort();
  541. }
  542. /* There are two keys on the page. */
  543. h->lower = BTDATAOFF + 2 * sizeof(indx_t);
  544. /* Unpin the root page, set to btree internal page. */
  545. h->flags &= ~P_TYPE;
  546. h->flags |= P_BINTERNAL;
  547. mpool_put(t->bt_mp, h, MPOOL_DIRTY);
  548. return (RET_SUCCESS);
  549. }
  550. /*
  551. * BT_PSPLIT -- Do the real work of splitting the page.
  552. *
  553. * Parameters:
  554. * t: tree
  555. * h: page to be split
  556. * l: page to put lower half of data
  557. * r: page to put upper half of data
  558. * pskip: pointer to index to leave open
  559. * ilen: insert length
  560. *
  561. * Returns:
  562. * Pointer to page in which to insert.
  563. */
  564. static PAGE *
  565. bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, u_int *pskip, size_t ilen)
  566. {
  567. BINTERNAL *bi;
  568. BLEAF *bl;
  569. RLEAF *rl;
  570. EPGNO *c;
  571. PAGE *rval;
  572. void *src = NULL;
  573. indx_t full, half, nxt, off, skip, top, used;
  574. size_t nbytes;
  575. int bigkeycnt, isbigkey;
  576. /*
  577. * Split the data to the left and right pages. Leave the skip index
  578. * open. Additionally, make some effort not to split on an overflow
  579. * key. This makes internal page processing faster and can save
  580. * space as overflow keys used by internal pages are never deleted.
  581. */
  582. bigkeycnt = 0;
  583. skip = *pskip;
  584. full = t->bt_psize - BTDATAOFF;
  585. half = full / 2;
  586. used = 0;
  587. for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
  588. if (skip == off) {
  589. nbytes = ilen;
  590. isbigkey = 0; /* XXX: not really known. */
  591. } else
  592. switch (h->flags & P_TYPE) {
  593. case P_BINTERNAL:
  594. src = bi = GETBINTERNAL(h, nxt);
  595. nbytes = NBINTERNAL(bi->ksize);
  596. isbigkey = bi->flags & P_BIGKEY;
  597. break;
  598. case P_BLEAF:
  599. src = bl = GETBLEAF(h, nxt);
  600. nbytes = NBLEAF(bl);
  601. isbigkey = bl->flags & P_BIGKEY;
  602. break;
  603. case P_RINTERNAL:
  604. src = GETRINTERNAL(h, nxt);
  605. nbytes = NRINTERNAL;
  606. isbigkey = 0;
  607. break;
  608. case P_RLEAF:
  609. src = rl = GETRLEAF(h, nxt);
  610. nbytes = NRLEAF(rl);
  611. isbigkey = 0;
  612. break;
  613. default:
  614. abort();
  615. }
  616. /*
  617. * If the key/data pairs are substantial fractions of the max
  618. * possible size for the page, it's possible to get situations
  619. * where we decide to try and copy too much onto the left page.
  620. * Make sure that doesn't happen.
  621. */
  622. if (skip <= off && used + nbytes >= full) {
  623. --off;
  624. break;
  625. }
  626. /* Copy the key/data pair, if not the skipped index. */
  627. if (skip != off) {
  628. ++nxt;
  629. l->linp[off] = l->upper -= nbytes;
  630. memmove((char *)l + l->upper, src, nbytes);
  631. }
  632. used += nbytes;
  633. if (used >= half) {
  634. if (!isbigkey || bigkeycnt == 3)
  635. break;
  636. else
  637. ++bigkeycnt;
  638. }
  639. }
  640. /*
  641. * Off is the last offset that's valid for the left page.
  642. * Nxt is the first offset to be placed on the right page.
  643. */
  644. l->lower += (off + 1) * sizeof(indx_t);
  645. /*
  646. * If splitting the page that the cursor was on, the cursor has to be
  647. * adjusted to point to the same record as before the split. If the
  648. * cursor is at or past the skipped slot, the cursor is incremented by
  649. * one. If the cursor is on the right page, it is decremented by the
  650. * number of records split to the left page.
  651. *
  652. * Don't bother checking for the B_SEQINIT flag, the page number will
  653. * be P_INVALID.
  654. */
  655. c = &t->bt_bcursor;
  656. if (c->pgno == h->pgno) {
  657. if (c->index >= skip)
  658. ++c->index;
  659. if (c->index < nxt) /* Left page. */
  660. c->pgno = l->pgno;
  661. else { /* Right page. */
  662. c->pgno = r->pgno;
  663. c->index -= nxt;
  664. }
  665. }
  666. /*
  667. * If the skipped index was on the left page, just return that page.
  668. * Otherwise, adjust the skip index to reflect the new position on
  669. * the right page.
  670. */
  671. if (skip <= off) {
  672. skip = 0;
  673. rval = l;
  674. } else {
  675. rval = r;
  676. *pskip -= nxt;
  677. }
  678. for (off = 0; nxt < top; ++off) {
  679. if (skip == nxt) {
  680. ++off;
  681. skip = 0;
  682. }
  683. switch (h->flags & P_TYPE) {
  684. case P_BINTERNAL:
  685. src = bi = GETBINTERNAL(h, nxt);
  686. nbytes = NBINTERNAL(bi->ksize);
  687. break;
  688. case P_BLEAF:
  689. src = bl = GETBLEAF(h, nxt);
  690. nbytes = NBLEAF(bl);
  691. break;
  692. case P_RINTERNAL:
  693. src = GETRINTERNAL(h, nxt);
  694. nbytes = NRINTERNAL;
  695. break;
  696. case P_RLEAF:
  697. src = rl = GETRLEAF(h, nxt);
  698. nbytes = NRLEAF(rl);
  699. break;
  700. default:
  701. abort();
  702. }
  703. ++nxt;
  704. r->linp[off] = r->upper -= nbytes;
  705. memmove((char *)r + r->upper, src, nbytes);
  706. }
  707. r->lower += off * sizeof(indx_t);
  708. /* If the key is being appended to the page, adjust the index. */
  709. if (skip == top)
  710. r->lower += sizeof(indx_t);
  711. return (rval);
  712. }
  713. /*
  714. * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
  715. *
  716. * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
  717. * record that references them gets deleted. Chains pointed to by internal
  718. * pages never get deleted. This routine marks a chain as pointed to by an
  719. * internal page.
  720. *
  721. * Parameters:
  722. * t: tree
  723. * pg: page number of first page in the chain.
  724. *
  725. * Returns:
  726. * RET_SUCCESS, RET_ERROR.
  727. */
  728. static int
  729. bt_preserve(BTREE *t, pgno_t pg)
  730. {
  731. PAGE *h;
  732. if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
  733. return (RET_ERROR);
  734. h->flags |= P_PRESERVE;
  735. mpool_put(t->bt_mp, h, MPOOL_DIRTY);
  736. return (RET_SUCCESS);
  737. }
  738. /*
  739. * REC_TOTAL -- Return the number of recno entries below a page.
  740. *
  741. * Parameters:
  742. * h: page
  743. *
  744. * Returns:
  745. * The number of recno entries below a page.
  746. *
  747. * XXX
  748. * These values could be set by the bt_psplit routine. The problem is that the
  749. * entry has to be popped off of the stack etc. or the values have to be passed
  750. * all the way back to bt_split/bt_rroot and it's not very clean.
  751. */
  752. static recno_t
  753. rec_total(PAGE *h)
  754. {
  755. recno_t recs;
  756. indx_t nxt, top;
  757. for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
  758. recs += GETRINTERNAL(h, nxt)->nrecs;
  759. return (recs);
  760. }