ITS#7589 avoid wasting space in mdb_page_split

Also, check the split point on branch pages as well as leaf pages.
vmware
Howard Chu 11 years ago
parent 01dfb2083d
commit 310b656a2e
  1. 217
      libraries/liblmdb/mdb.c

@@ -7376,10 +7376,11 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
unsigned int nflags) unsigned int nflags)
{ {
unsigned int flags; unsigned int flags;
int rc = MDB_SUCCESS, ins_new = 0, new_root = 0, newpos = 1, did_split = 0; int rc = MDB_SUCCESS, new_root = 0, did_split = 0;
indx_t newindx; indx_t newindx;
pgno_t pgno = 0; pgno_t pgno = 0;
unsigned int i, j, split_indx, nkeys, pmax; unsigned int i, j, split_indx, nkeys, pmax;
MDB_env *env = mc->mc_txn->mt_env;
MDB_node *node; MDB_node *node;
MDB_val sepkey, rkey, xdata, *rdata = &xdata; MDB_val sepkey, rkey, xdata, *rdata = &xdata;
MDB_page *copy; MDB_page *copy;
@@ -7390,10 +7391,11 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
mp = mc->mc_pg[mc->mc_top]; mp = mc->mc_pg[mc->mc_top];
newindx = mc->mc_ki[mc->mc_top]; newindx = mc->mc_ki[mc->mc_top];
nkeys = NUMKEYS(mp);
DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i", DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i",
IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
DKEY(newkey), mc->mc_ki[mc->mc_top])); DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys));
/* Create a right sibling. */ /* Create a right sibling. */
if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
@@ -7440,13 +7442,9 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
sepkey = *newkey; sepkey = *newkey;
split_indx = newindx; split_indx = newindx;
nkeys = 0; nkeys = 0;
goto newsep; } else {
}
nkeys = NUMKEYS(mp); split_indx = (nkeys+1) / 2;
split_indx = nkeys / 2;
if (newindx < split_indx)
newpos = 0;
if (IS_LEAF2(rp)) { if (IS_LEAF2(rp)) {
char *split, *ins; char *split, *ins;
@@ -7488,93 +7486,104 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
mc->mc_ki[mc->mc_top] = x; mc->mc_ki[mc->mc_top] = x;
mc->mc_pg[mc->mc_top] = rp; mc->mc_pg[mc->mc_top] = rp;
} }
goto newsep; } else {
unsigned int psize, nsize, tsize;
int k;
/* Maximum free space in an empty page */
pmax = env->me_psize - PAGEHDRSZ;
if (IS_LEAF(mp))
nsize = mdb_leaf_size(env, newkey, newdata);
else
nsize = mdb_branch_size(env, newkey);
nsize += nsize & 1;
/* grab a page to hold a temporary copy */
copy = mdb_page_malloc(mc->mc_txn, 1);
if (copy == NULL)
return ENOMEM;
copy->mp_pgno = mp->mp_pgno;
copy->mp_flags = mp->mp_flags;
copy->mp_lower = PAGEHDRSZ;
copy->mp_upper = env->me_psize;
/* prepare to insert */
for (i=0, j=0; i<nkeys; i++) {
if (i == newindx) {
copy->mp_ptrs[j++] = 0;
}
copy->mp_ptrs[j++] = mp->mp_ptrs[i];
} }
/* For leaf pages, check the split point based on what /* When items are relatively large the split point needs
* fits where, since otherwise mdb_node_add can fail. * to be checked, because being off-by-one will make the
* * difference between success or failure in mdb_node_add.
* This check is only needed when the data items are
* relatively large, such that being off by one will
* make the difference between success or failure.
* *
* It's also relevant if a page happens to be laid out * It's also relevant if a page happens to be laid out
* such that one half of its nodes are all "small" and * such that one half of its nodes are all "small" and
* the other half of its nodes are "large." If the new * the other half of its nodes are "large." If the new
* item is also "large" and falls on the half with * item is also "large" and falls on the half with
* "large" nodes, it also may not fit. * "large" nodes, it also may not fit.
*/ *
if (IS_LEAF(mp)) { * As a final tweak, if the new item goes on the last
unsigned int psize, nsize; * spot on the page (and thus, onto the new page), bias
/* Maximum free space in an empty page */ * the split so the new page is emptier than the old page.
pmax = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ; * This yields better packing during sequential inserts.
nsize = mdb_leaf_size(mc->mc_txn->mt_env, newkey, newdata); */
if ((nkeys < 20) || (nsize > pmax/16)) { if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) {
if (newindx <= split_indx) { /* Find split point */
psize = nsize; psize = 0;
newpos = 0; if (newindx <= split_indx || newindx >= nkeys) {
for (i=0; i<split_indx; i++) { i = 0; j = 1;
node = NODEPTR(mp, i); k = newindx >= nkeys ? nkeys : split_indx+1;
psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); } else {
if (F_ISSET(node->mn_flags, F_BIGDATA)) i = nkeys; j = -1;
psize += sizeof(pgno_t); k = split_indx-1;
else
psize += NODEDSZ(node);
psize += psize & 1;
if (psize > pmax) {
if (i <= newindx) {
split_indx = newindx;
if (i < newindx)
newpos = 1;
}
else
split_indx = i;
break;
}
} }
for (; i!=k; i+=j) {
if (i == newindx) {
tsize = nsize;
node = NULL;
} else { } else {
psize = nsize; node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
for (i=nkeys-1; i>=split_indx; i--) { tsize = NODESIZE + NODEKSZ(node) + sizeof(indx_t);
node = NODEPTR(mp, i); if (IS_LEAF(mp)) {
psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
if (F_ISSET(node->mn_flags, F_BIGDATA)) if (F_ISSET(node->mn_flags, F_BIGDATA))
psize += sizeof(pgno_t); tsize += sizeof(pgno_t);
else else
psize += NODEDSZ(node); tsize += NODEDSZ(node);
psize += psize & 1;
if (psize > pmax) {
if (i >= newindx) {
split_indx = newindx;
newpos = 0;
} else
split_indx = i+1;
break;
}
} }
tsize += tsize & 1;
} }
if (psize + tsize > pmax) {
split_indx = i + (j<0);
break;
} }
psize += tsize;
} }
/* special case: when the new node was on the last
/* First find the separating key between the split pages. * slot we may not have tripped the break inside the loop.
* The case where newindx == split_indx is ambiguous; the * In all other cases we either hit the break condition,
* new item could go to the new page or stay on the original * or the original split_indx was already safe.
* page. If newpos == 1 it goes to the new page.
*/ */
if (newindx == split_indx && newpos) { if (newindx >= nkeys && i == k)
split_indx = nkeys-1;
}
if (split_indx == newindx) {
sepkey.mv_size = newkey->mv_size; sepkey.mv_size = newkey->mv_size;
sepkey.mv_data = newkey->mv_data; sepkey.mv_data = newkey->mv_data;
} else { } else {
node = NODEPTR(mp, split_indx); node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]);
sepkey.mv_size = node->mn_ksize; sepkey.mv_size = node->mn_ksize;
sepkey.mv_data = NODEKEY(node); sepkey.mv_data = NODEKEY(node);
} }
}
}
newsep: DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
DPRINTF(("separator is [%s]", DKEY(&sepkey)));
/* Copy separator key to the parent. /* Copy separator key to the parent.
*/ */
if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(mc->mc_txn->mt_env, &sepkey)) { if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) {
mn.mc_snum--; mn.mc_snum--;
mn.mc_top--; mn.mc_top--;
did_split = 1; did_split = 1;
@@ -7619,36 +7628,13 @@ newsep:
return rc; return rc;
for (i=0; i<mc->mc_top; i++) for (i=0; i<mc->mc_top; i++)
mc->mc_ki[i] = mn.mc_ki[i]; mc->mc_ki[i] = mn.mc_ki[i];
goto done; } else if (!IS_LEAF2(mp)) {
} /* Move nodes */
if (IS_LEAF2(rp)) {
goto done;
}
/* Move half of the keys to the right sibling. */
/* grab a page to hold a temporary copy */
copy = mdb_page_malloc(mc->mc_txn, 1);
if (copy == NULL)
return ENOMEM;
copy->mp_pgno = mp->mp_pgno;
copy->mp_flags = mp->mp_flags;
copy->mp_lower = PAGEHDRSZ;
copy->mp_upper = mc->mc_txn->mt_env->me_psize;
mc->mc_pg[mc->mc_top] = copy;
for (i = j = 0; i <= nkeys; j++) {
if (i == split_indx) {
/* Insert in right sibling. */
/* Reset insert index for right sibling. */
if (i != newindx || (newpos ^ ins_new)) {
j = 0;
mc->mc_pg[mc->mc_top] = rp; mc->mc_pg[mc->mc_top] = rp;
} i = split_indx;
} j = 0;
do {
if (i == newindx && !ins_new) { if (i == newindx) {
/* Insert the original entry that caused the split. */
rkey.mv_data = newkey->mv_data; rkey.mv_data = newkey->mv_data;
rkey.mv_size = newkey->mv_size; rkey.mv_size = newkey->mv_size;
if (IS_LEAF(mp)) { if (IS_LEAF(mp)) {
@@ -7656,15 +7642,10 @@ newsep:
} else } else
pgno = newpgno; pgno = newpgno;
flags = nflags; flags = nflags;
ins_new = 1;
/* Update index for the new key. */ /* Update index for the new key. */
mc->mc_ki[mc->mc_top] = j; mc->mc_ki[mc->mc_top] = j;
} else if (i == nkeys) {
break;
} else { } else {
node = NODEPTR(mp, i); node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
rkey.mv_data = NODEKEY(node); rkey.mv_data = NODEKEY(node);
rkey.mv_size = node->mn_ksize; rkey.mv_size = node->mn_ksize;
if (IS_LEAF(mp)) { if (IS_LEAF(mp)) {
@@ -7674,8 +7655,6 @@ newsep:
} else } else
pgno = NODEPGNO(node); pgno = NODEPGNO(node);
flags = node->mn_flags; flags = node->mn_flags;
i++;
} }
if (!IS_LEAF(mp) && j == 0) { if (!IS_LEAF(mp) && j == 0) {
@@ -7684,8 +7663,20 @@ newsep:
} }
rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
if (rc) break; if (rc) {
/* return tmp page to freelist */
mdb_page_free(env, copy);
return rc;
} }
if (i == nkeys) {
i = 0;
j = 0;
mc->mc_pg[mc->mc_top] = copy;
} else {
i++;
j++;
}
} while (i != split_indx);
nkeys = NUMKEYS(copy); nkeys = NUMKEYS(copy);
for (i=0; i<nkeys; i++) for (i=0; i<nkeys; i++)
@@ -7693,10 +7684,10 @@ newsep:
mp->mp_lower = copy->mp_lower; mp->mp_lower = copy->mp_lower;
mp->mp_upper = copy->mp_upper; mp->mp_upper = copy->mp_upper;
memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
mc->mc_txn->mt_env->me_psize - copy->mp_upper); env->me_psize - copy->mp_upper);
/* reset back to original page */ /* reset back to original page */
if (newindx < split_indx || (!newpos && newindx == split_indx)) { if (newindx < split_indx) {
mc->mc_pg[mc->mc_top] = mp; mc->mc_pg[mc->mc_top] = mp;
if (nflags & MDB_RESERVE) { if (nflags & MDB_RESERVE) {
node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
@@ -7704,6 +7695,7 @@ newsep:
newdata->mv_data = NODEDATA(node); newdata->mv_data = NODEDATA(node);
} }
} else { } else {
mc->mc_pg[mc->mc_top] = rp;
mc->mc_ki[ptop]++; mc->mc_ki[ptop]++;
/* Make sure mc_ki is still valid. /* Make sure mc_ki is still valid.
*/ */
@@ -7717,10 +7709,10 @@ newsep:
mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
} }
} }
/* return tmp page to freelist */ /* return tmp page to freelist */
mdb_page_free(mc->mc_txn->mt_env, copy); mdb_page_free(env, copy);
done: }
{ {
/* Adjust other cursors pointing to mp */ /* Adjust other cursors pointing to mp */
MDB_cursor *m2, *m3; MDB_cursor *m2, *m3;
@@ -7768,6 +7760,7 @@ done:
} }
} }
} }
DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
return rc; return rc;
} }

Loading…
Cancel
Save