LockTree library, originally from PerconaFT (#7753)

Summary:
To be used for implementing Range Locking.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7753

Reviewed By: zhichao-cao

Differential Revision: D25378980

Pulled By: cheng-chang

fbshipit-source-id: 801a9c5cd92a84654ca2586b73e8f69001e89320
main
Sergei Petrunia 4 years ago committed by Facebook GitHub Bot
parent 7b2216c906
commit 98236fb10e
  1. 14
      CMakeLists.txt
  2. 24
      TARGETS
  3. 12
      src.mk
  4. 661
      utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3
  5. 174
      utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2
  6. 339
      utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2
  7. 13
      utilities/transactions/lock/range/range_tree/lib/README
  8. 76
      utilities/transactions/lock/range/range_tree/lib/db.h
  9. 124
      utilities/transactions/lock/range/range_tree/lib/ft/comparator.h
  10. 88
      utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h
  11. 139
      utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc
  12. 174
      utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h
  13. 221
      utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc
  14. 140
      utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h
  15. 534
      utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc
  16. 238
      utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h
  17. 1000
      utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc
  18. 559
      utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h
  19. 526
      utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc
  20. 264
      utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc
  21. 177
      utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h
  22. 519
      utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc
  23. 301
      utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h
  24. 119
      utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc
  25. 91
      utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h
  26. 212
      utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc
  27. 123
      utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h
  28. 201
      utilities/transactions/lock/range/range_tree/lib/portability/memory.h
  29. 37
      utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h
  30. 116
      utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h
  31. 82
      utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h
  32. 240
      utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h
  33. 73
      utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h
  34. 501
      utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h
  35. 165
      utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h
  36. 158
      utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h
  37. 27
      utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h
  38. 132
      utilities/transactions/lock/range/range_tree/lib/standalone_port.cc
  39. 153
      utilities/transactions/lock/range/range_tree/lib/util/dbt.cc
  40. 84
      utilities/transactions/lock/range/range_tree/lib/util/dbt.h
  41. 143
      utilities/transactions/lock/range/range_tree/lib/util/growable_array.h
  42. 187
      utilities/transactions/lock/range/range_tree/lib/util/memarena.cc
  43. 127
      utilities/transactions/lock/range/range_tree/lib/util/memarena.h
  44. 793
      utilities/transactions/lock/range/range_tree/lib/util/omt.h
  45. 1294
      utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h
  46. 151
      utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h
  47. 62
      utilities/transactions/lock/range/range_tree/lib/util/status.h

@ -831,6 +831,20 @@ set(SOURCES
utilities/write_batch_with_index/write_batch_with_index_internal.cc utilities/write_batch_with_index/write_batch_with_index_internal.cc
$<TARGET_OBJECTS:build_version>) $<TARGET_OBJECTS:build_version>)
list(APPEND SOURCES
utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc
utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc
utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc
utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc
utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc
utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc
utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc
utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc
utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc
utilities/transactions/lock/range/range_tree/lib/standalone_port.cc
utilities/transactions/lock/range/range_tree/lib/util/dbt.cc
utilities/transactions/lock/range/range_tree/lib/util/memarena.cc)
if(HAVE_SSE42 AND NOT MSVC) if(HAVE_SSE42 AND NOT MSVC)
set_source_files_properties( set_source_files_properties(
util/crc32c.cc util/crc32c.cc

@ -388,6 +388,18 @@ cpp_library(
"utilities/transactions/lock/lock_manager.cc", "utilities/transactions/lock/lock_manager.cc",
"utilities/transactions/lock/point/point_lock_manager.cc", "utilities/transactions/lock/point/point_lock_manager.cc",
"utilities/transactions/lock/point/point_lock_tracker.cc", "utilities/transactions/lock/point/point_lock_tracker.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc",
"utilities/transactions/lock/range/range_tree/lib/standalone_port.cc",
"utilities/transactions/lock/range/range_tree/lib/util/dbt.cc",
"utilities/transactions/lock/range/range_tree/lib/util/memarena.cc",
"utilities/transactions/optimistic_transaction.cc", "utilities/transactions/optimistic_transaction.cc",
"utilities/transactions/optimistic_transaction_db_impl.cc", "utilities/transactions/optimistic_transaction_db_impl.cc",
"utilities/transactions/pessimistic_transaction.cc", "utilities/transactions/pessimistic_transaction.cc",
@ -678,6 +690,18 @@ cpp_library(
"utilities/transactions/lock/lock_manager.cc", "utilities/transactions/lock/lock_manager.cc",
"utilities/transactions/lock/point/point_lock_manager.cc", "utilities/transactions/lock/point/point_lock_manager.cc",
"utilities/transactions/lock/point/point_lock_tracker.cc", "utilities/transactions/lock/point/point_lock_tracker.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc",
"utilities/transactions/lock/range/range_tree/lib/standalone_port.cc",
"utilities/transactions/lock/range/range_tree/lib/util/dbt.cc",
"utilities/transactions/lock/range/range_tree/lib/util/memarena.cc",
"utilities/transactions/optimistic_transaction.cc", "utilities/transactions/optimistic_transaction.cc",
"utilities/transactions/optimistic_transaction_db_impl.cc", "utilities/transactions/optimistic_transaction_db_impl.cc",
"utilities/transactions/pessimistic_transaction.cc", "utilities/transactions/pessimistic_transaction.cc",

@ -255,6 +255,18 @@ LIB_SOURCES = \
utilities/transactions/lock/lock_manager.cc \ utilities/transactions/lock/lock_manager.cc \
utilities/transactions/lock/point/point_lock_tracker.cc \ utilities/transactions/lock/point/point_lock_tracker.cc \
utilities/transactions/lock/point/point_lock_manager.cc \ utilities/transactions/lock/point/point_lock_manager.cc \
utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc \
utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc \
utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc \
utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc \
utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc \
utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc \
utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc \
utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc \
utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc \
utilities/transactions/lock/range/range_tree/lib/standalone_port.cc \
utilities/transactions/lock/range/range_tree/lib/util/dbt.cc \
utilities/transactions/lock/range/range_tree/lib/util/memarena.cc \
utilities/transactions/optimistic_transaction.cc \ utilities/transactions/optimistic_transaction.cc \
utilities/transactions/optimistic_transaction_db_impl.cc \ utilities/transactions/optimistic_transaction_db_impl.cc \
utilities/transactions/pessimistic_transaction.cc \ utilities/transactions/pessimistic_transaction.cc \

@ -0,0 +1,661 @@
GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 19 November 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.
A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate. Many developers of free software are heartened and
encouraged by the resulting cooperation. However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.
The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community. It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server. Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.
An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals. This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU Affero General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Remote Network Interaction; Use with the GNU General Public License.
Notwithstanding any other provision of this License, if you modify the
Program, your modified version must prominently offer all users
interacting with it remotely through a computer network (if your version
supports such interaction) an opportunity to receive the Corresponding
Source of your version by providing access to the Corresponding Source
from a network server at no charge, through some standard or customary
means of facilitating copying of software. This Corresponding Source
shall include the Corresponding Source for any work covered by version 3
of the GNU General Public License that is incorporated pursuant to the
following paragraph.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the work with which it is combined will remain governed by version
3 of the GNU General Public License.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU Affero General Public License from time to time. Such new versions
will be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU Affero General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU Affero General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU Affero General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If your software can interact with users remotely through a computer
network, you should also make sure that it provides a way for users to
get its source. For example, if your program is a web application, its
interface could display a "Source" link that leads users to an archive
of the code. There are many ways you could offer source, and different
solutions will be better for different programs; see section 13 for the
specific requirements.
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU AGPL, see
<http://www.gnu.org/licenses/>.

@@ -0,0 +1,174 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

@@ -0,0 +1,339 @@
GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
the GNU Lesser General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
source code. And you must show them these terms so they know their
rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software. If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
Finally, any free program is threatened constantly by software
patents. We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary. To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License. The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language. (Hereinafter, translation is included without limitation in
the term "modification".) Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.
1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.
You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) You must cause the modified files to carry prominent notices
stating that you changed the files and the date of any change.
b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any
part thereof, to be licensed as a whole at no charge to all third
parties under the terms of this License.
c) If the modified program normally reads commands interactively
when run, you must cause it, when started running for such
interactive use in the most ordinary way, to print or display an
announcement including an appropriate copyright notice and a
notice that there is no warranty (or else, saying that you provide
a warranty) and that users may redistribute the program under
these conditions, and telling the user how to view a copy of this
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:
a) Accompany it with the complete corresponding machine-readable
source code, which must be distributed under the terms of Sections
1 and 2 above on a medium customarily used for software interchange; or,
b) Accompany it with a written offer, valid for at least three
years, to give any third party, for a charge no more than your
cost of physically performing source distribution, a complete
machine-readable copy of the corresponding source code, to be
distributed under the terms of Sections 1 and 2 above on a medium
customarily used for software interchange; or,
c) Accompany it with the information you received as to the offer
to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
received the program in object code or executable form with such
an offer, in accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable. However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.
If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.
5. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Program or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all. For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.
10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:
Gnomovision version 69, Copyright (C) year name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
`Gnomovision' (which makes passes at compilers) written by James Hacker.
<signature of Ty Coon>, 1 April 1989
Ty Coon, President of Vice
This General Public License does not permit incorporating your program into
proprietary programs. If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License.

@ -0,0 +1,13 @@
The files in this directory originally come from
https://github.com/percona/PerconaFT/.
This directory only includes the "locktree" part of PerconaFT, and its
dependencies.
The following modifications were made:
- Make locktree usable outside of PerconaFT library
- Add shared read-only lock support
The files named *_subst.* are substitutes for PerconaFT's files; they
contain replacements for PerconaFT's functionality.

@ -0,0 +1,76 @@
#ifndef _DB_H
#define _DB_H
#include <stdint.h>
#include <sys/types.h>
// DBT is the short name used throughout for struct __toku_dbt, which is
// defined further down in this header.
typedef struct __toku_dbt DBT;
// port: this is currently not used
// A minimal buffer descriptor: a 32-bit length plus a pointer to the data.
struct simple_dbt {
uint32_t len;  // length of the buffer at `data` (presumably in bytes - confirm)
void *data;    // pointer to the buffer contents
};
// engine status info
// engine status is passed to handlerton as an array of
// TOKU_ENGINE_STATUS_ROW_S[]
//
// Tells the reader of a status row how the row's value should be
// interpreted. The enumerators are numbered consecutively starting at 0.
typedef enum {
STATUS_FS_STATE = 0, // interpret as file system state (redzone) enum
STATUS_UINT64, // interpret as uint64_t
STATUS_CHARSTR, // interpret as char *
STATUS_UNIXTIME, // interpret as time_t
STATUS_TOKUTIME, // interpret as tokutime_t
STATUS_PARCOUNT, // interpret as PARTITIONED_COUNTER
STATUS_DOUBLE // interpret as double
} toku_engine_status_display_type;
// Bit values saying which kind of status query a row should be included in.
// Each enumerator is a distinct single bit (bit 0 and bit 1).
typedef enum {
TOKU_ENGINE_STATUS = (1ULL << 0), // Include when asking for engine status
TOKU_GLOBAL_STATUS =
(1ULL << 1), // Include when asking for information_schema.global_status
} toku_engine_status_include_type;
// One row of engine status: naming information, how to interpret the value,
// which callers should see it, and the value itself.
// TOKU_ENGINE_STATUS_ROW is a pointer to a row; TOKU_ENGINE_STATUS_ROW_S is
// the row struct itself.
typedef struct __toku_engine_status_row {
const char *keyname; // info schema key, should not change across revisions
// without good reason
const char
*columnname; // column for mysql, e.g. information_schema.global_status.
// TOKUDB_ will automatically be prefixed.
const char *legend; // the text that will appear at user interface
toku_engine_status_display_type type; // how to interpret the value
toku_engine_status_include_type
include; // which kinds of callers should get to read this row?
// The value of the row; `type` says how it is to be interpreted.
union {
double dnum;
uint64_t num;
const char *str;
char datebuf[26];
struct partitioned_counter *parcount;
} value;
} * TOKU_ENGINE_STATUS_ROW, TOKU_ENGINE_STATUS_ROW_S;
// Negative DB_* result codes.
// NOTE(review): the numeric values look like they were kept from an existing
// DB API for compatibility - confirm before renumbering any of them.
#define DB_BUFFER_SMALL -30999
#define DB_LOCK_DEADLOCK -30995
#define DB_LOCK_NOTGRANTED -30994
#define DB_NOTFOUND -30989
#define DB_KEYEXIST -30996
// DB_DBT_XXX values for the `flags` field of a DBT (struct __toku_dbt).
#define DB_DBT_MALLOC 8
#define DB_DBT_REALLOC 64
#define DB_DBT_USERMEM 256
/* PerconaFT specific error codes */
#define TOKUDB_OUT_OF_LOCKS -100000
// Callback type that receives an opaque argument and two transaction ids:
// the id of the requesting transaction and the id of the blocking one.
// NOTE(review): presumably invoked when a lock request has to wait on
// another transaction - confirm against the lock request code.
typedef void (*lock_wait_callback)(void *arg, uint64_t requesting_txnid,
uint64_t blocking_txnid);
// The DBT type: a pointer to data, two size fields and a flags word.
struct __toku_dbt {
void *data;   // pointer to the data
size_t size;  // size of the data (presumably in bytes - confirm)
size_t ulen;  // NOTE(review): presumably the capacity of a user-supplied
              // buffer (see DB_DBT_USERMEM) - confirm
// One of DB_DBT_XXX flags
uint32_t flags;
};
#endif

@ -0,0 +1,124 @@
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include <string.h>
#include "../db.h"
#include "../portability/memory.h"
#include "../util/dbt.h"
// Signature of a user-supplied key comparison callback. `arg` is an opaque
// pointer supplied together with the callback; `a` and `b` are the two keys.
// NOTE(review): the int result is presumably a three-way value
// (negative / zero / positive) - confirm against callers.
typedef int (*ft_compare_func)(void *arg, const DBT *a, const DBT *b);
// Compares two keys given as raw (pointer, length) pairs. Defined elsewhere.
int toku_keycompare(const void *key1, size_t key1len, const void *key2,
size_t key2len);
// Built-in comparison of two DBT keys; exported with default visibility.
// Defined elsewhere.
int toku_builtin_compare_fun(const DBT *, const DBT *)
__attribute__((__visibility__("default")));
namespace toku {
// a comparator object encapsulates the data necessary for
// comparing two keys in a fractal tree. it further understands
// that points may be positive or negative infinity.
class comparator {
void init(ft_compare_func cmp, void *cmp_arg, uint8_t memcmp_magic) {
_cmp = cmp;
_cmp_arg = cmp_arg;
_memcmp_magic = memcmp_magic;
}
public:
// This magic value is reserved to mean that the magic has not been set.
static const uint8_t MEMCMP_MAGIC_NONE = 0;
void create(ft_compare_func cmp, void *cmp_arg,
uint8_t memcmp_magic = MEMCMP_MAGIC_NONE) {
init(cmp, cmp_arg, memcmp_magic);
}
// inherit the attributes of another comparator, but keep our own
// copy of fake_db that is owned separately from the one given.
void inherit(const comparator &cmp) {
invariant_notnull(cmp._cmp);
init(cmp._cmp, cmp._cmp_arg, cmp._memcmp_magic);
}
// like inherit, but doesn't require that the this comparator
// was already created
void create_from(const comparator &cmp) { inherit(cmp); }
void destroy() {}
ft_compare_func get_compare_func() const { return _cmp; }
uint8_t get_memcmp_magic() const { return _memcmp_magic; }
bool valid() const { return _cmp != nullptr; }
inline bool dbt_has_memcmp_magic(const DBT *dbt) const {
return *reinterpret_cast<const char *>(dbt->data) == _memcmp_magic;
}
int operator()(const DBT *a, const DBT *b) const {
if (__builtin_expect(toku_dbt_is_infinite(a) || toku_dbt_is_infinite(b),
0)) {
return toku_dbt_infinite_compare(a, b);
} else if (_memcmp_magic != MEMCMP_MAGIC_NONE
// If `a' has the memcmp magic..
&& dbt_has_memcmp_magic(a)
// ..then we expect `b' to also have the memcmp magic
&& __builtin_expect(dbt_has_memcmp_magic(b), 1)) {
assert(0); // psergey: this branch should not be taken.
return toku_builtin_compare_fun(a, b);
} else {
// yikes, const sadness here
return _cmp(_cmp_arg, a, b);
}
}
private:
ft_compare_func _cmp;
void *_cmp_arg;
uint8_t _memcmp_magic;
};
} /* namespace toku */

@ -0,0 +1,88 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include "../db.h"
#include "../portability/toku_race_tools.h"
#include "../util/status.h"
//
// Lock Tree Manager statistics
//
// Holds one status row per statistic. The anonymous enum below names the
// rows; each enumerator is the index of its row in `status`.
class LTM_STATUS_S {
public:
// Row indexes. Numbering starts at 0 and is consecutive, so the final
// enumerator equals the number of rows and is used to size `status`.
enum {
LTM_SIZE_CURRENT = 0,
LTM_SIZE_LIMIT,
LTM_ESCALATION_COUNT,
LTM_ESCALATION_TIME,
LTM_ESCALATION_LATEST_RESULT,
LTM_NUM_LOCKTREES,
LTM_LOCK_REQUESTS_PENDING,
LTM_STO_NUM_ELIGIBLE,
LTM_STO_END_EARLY_COUNT,
LTM_STO_END_EARLY_TIME,
LTM_WAIT_COUNT,
LTM_WAIT_TIME,
LTM_LONG_WAIT_COUNT,
LTM_LONG_WAIT_TIME,
LTM_TIMEOUT_COUNT,
LTM_WAIT_ESCALATION_COUNT,
LTM_WAIT_ESCALATION_TIME,
LTM_LONG_WAIT_ESCALATION_COUNT,
LTM_LONG_WAIT_ESCALATION_TIME,
LTM_STATUS_NUM_ROWS // must be last
};
// Defined elsewhere.
// NOTE(review): presumably fills in / tears down the rows of `status` and
// tracks that via m_initialized - confirm in the implementation.
void init(void);
void destroy(void);
// The status rows, indexed by the enumerators above.
TOKU_ENGINE_STATUS_ROW_S status[LTM_STATUS_NUM_ROWS];
private:
// Starts out false.
bool m_initialized = false;
};
// Pointer-to-status convenience type.
typedef LTM_STATUS_S* LTM_STATUS;
// The single global status object; defined elsewhere.
extern LTM_STATUS_S ltm_status;
// Expands to the uint64_t value (`value.num`) of row `x` of the global
// status object; `x` is an LTM_STATUS_S enumerator name without the
// class qualifier.
#define LTM_STATUS_VAL(x) ltm_status.status[LTM_STATUS_S::x].value.num
void toku_status_init(void); // just call ltm_status.init();
void toku_status_destroy(void); // just call ltm_status.destroy();

@ -0,0 +1,139 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef ROCKSDB_LITE
#ifndef OS_WIN
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "concurrent_tree.h"
// PORT #include <toku_assert.h>
namespace toku {
void concurrent_tree::create(const comparator *cmp) {
// start with an empty root node. we do this instead of
// setting m_root to null so there's always a root to lock
m_root.create_root(cmp);
}
void concurrent_tree::destroy(void) { m_root.destroy_root(); }
// Reports whether the tree is empty, which is the case exactly when the
// root node reports itself empty.
bool concurrent_tree::is_empty() {
  const bool root_empty = m_root.is_empty();
  return root_empty;
}
// Memory overhead charged for one insertion: the size of one treenode.
uint64_t concurrent_tree::get_insertion_memory_overhead() {
  const uint64_t node_bytes = sizeof(treenode);
  return node_bytes;
}
// Binds this locked_keyrange to `tree`, makes it cover the infinite range
// rooted at the tree's root node, and then locks the root's mutex.
// Locking the root is the first step in acquiring a locked keyrange.
void concurrent_tree::locked_keyrange::prepare(concurrent_tree *tree) {
  m_tree = tree;
  m_subtree = &tree->m_root;
  m_range = keyrange::get_infinite_range();
  m_subtree->mutex_lock();
}
void concurrent_tree::locked_keyrange::acquire(const keyrange &range) {
treenode *const root = &m_tree->m_root;
treenode *subtree;
if (root->is_empty() || root->range_overlaps(range)) {
subtree = root;
} else {
// we do not have a precomputed comparison hint, so pass null
const keyrange::comparison *cmp_hint = nullptr;
subtree = root->find_node_with_overlapping_child(range, cmp_hint);
}
// subtree is locked. it will be unlocked when this is release()'d
invariant_notnull(subtree);
m_range = range;
m_subtree = subtree;
}
// Adds `new_owner` to `range` by inserting into the current subtree with
// the shared flag set; returns whatever the subtree's insert() returns.
bool concurrent_tree::locked_keyrange::add_shared_owner(const keyrange &range,
                                                        TXNID new_owner) {
  const bool is_shared = true;
  return m_subtree->insert(range, new_owner, is_shared);
}
void concurrent_tree::locked_keyrange::release(void) {
m_subtree->mutex_unlock();
}
// Stores `range` with its owner `txnid` under the current subtree.
// A non-empty subtree takes a regular insert. An empty subtree (which can
// only be the root) has its own range and txnid set in place instead.
void concurrent_tree::locked_keyrange::insert(const keyrange &range,
                                              TXNID txnid, bool is_shared) {
  if (!m_subtree->is_empty()) {
    m_subtree->insert(range, txnid, is_shared);
    return;
  }
  m_subtree->set_range_and_txnid(range, txnid, is_shared);
}
void concurrent_tree::locked_keyrange::remove(const keyrange &range,
TXNID txnid) {
invariant(!m_subtree->is_empty());
treenode *new_subtree = m_subtree->remove(range, txnid);
// if removing range changed the root of the subtree,
// then the subtree must be the root of the entire tree.
if (new_subtree == nullptr) {
invariant(m_subtree->is_root());
invariant(m_subtree->is_empty());
}
}
void concurrent_tree::locked_keyrange::remove_all(void) {
m_subtree->recursive_remove();
}
} /* namespace toku */
#endif // OS_WIN
#endif // ROCKSDB_LITE

@ -0,0 +1,174 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include "../ft/comparator.h"
#include "keyrange.h"
#include "treenode.h"
namespace toku {
// A concurrent_tree holds a set of non-overlapping key ranges.
// Accesses to disjoint parts of the tree usually proceed concurrently.
class concurrent_tree {
 public:
  // A locked_keyrange grants exclusive access for reads and writes on any
  // key inside its range. The holder may only operate on keys in that range,
  // or on keys obtained from the keyrange through iterate().
  //
  // Access model:
  // - the user prepares a locked keyrange; all threads serialize behind
  //   prepare().
  // - the user leaves the serialization point either by acquiring a range
  //   or by releasing.
  // - a given locked_keyrange object is operated on by one thread at a time.
  // - once finished, that thread releases it.
  class locked_keyrange {
   public:
    // effect: prepares to acquire a locked keyrange over `tree`. No other
    //         thread can prepare until this one calls acquire() or release().
    // note: operating on a prepared keyrange is equivalent to operating on
    //       a keyrange acquired over (-inf, +inf).
    // rationale: gives the user a serialization point for descending or
    //            modifying the tree, and a convenient way to perform
    //            serializable operations on it.
    // The two valid call sequences are:
    //  - prepare, acquire, [operations], release
    //  - prepare, [operations], release
    void prepare(concurrent_tree *tree);

    // requires: prepare() was called on this locked keyrange
    // effect: acquires a locked keyrange over the tree, representing the
    //         keys overlapped by `range`
    void acquire(const keyrange &range);

    // effect: releases the locked keyrange together with the mutex it holds
    void release();

    // effect: visits every range represented by this locked_keyrange,
    //         calling function->fn() with each node's keyrange and txnid,
    //         until the ranges run out or the function returns false
    template <typename F>
    void iterate(F *function) const {
      // an empty subtree has nothing to visit
      if (m_subtree->is_empty()) {
        return;
      }
      // otherwise hand every overlapping (range, txnid) pair to the function
      m_subtree->traverse_overlaps(m_range, function);
    }

    // Adds one more owner to the lock held on `range`.
    // requires: the keyrange contains a single treenode whose bounds equal
    //           the specified range exactly (no subsets or supersets)
    bool add_shared_owner(const keyrange &range, TXNID new_owner);

    // effect: inserts `range` into the tree, associated with `txnid`
    // requires: `range` overlaps nothing in this locked_keyrange
    // rationale: keeping inserted ranges unique is the caller's job
    void insert(const keyrange &range, TXNID txnid, bool is_shared);

    // effect: removes `range` from the tree.
    //         - txnid=TXNID_ANY removes the range whoever its owners are
    //         - any other value removes just that txnid from the owners
    //           (a range that still has other owners stays in the tree)
    // requires: `range` exists exactly in this locked_keyrange
    // rationale: removing only existing ranges is the caller's job
    void remove(const keyrange &range, TXNID txnid);

    // effect: removes every key represented by this locked keyrange
    // rationale: a fast way of emptying out a tree
    void remove_all();

   private:
    // the tree this locked keyrange belongs to
    concurrent_tree *m_tree;

    // the key range this locked keyrange stands for
    keyrange m_range;

    // the subtree beneath which every overlapping range lives
    treenode *m_subtree;

    friend class concurrent_tree_unit_test;
  };

  // effect: puts the tree into an empty, initialized state
  void create(const comparator *cmp);

  // effect: destroys the tree
  // requires: the tree is empty
  void destroy();

  // returns: true iff the tree is empty
  bool is_empty();

  // returns: the memory overhead of one insertion into the tree
  static uint64_t get_insertion_memory_overhead();

 private:
  // The root is stored by value, not by pointer: it has to exist at all
  // times so there is a lock to take even when the tree is empty.
  treenode m_root;

  friend class concurrent_tree_unit_test;
};
} /* namespace toku */

@ -0,0 +1,221 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef ROCKSDB_LITE
#ifndef OS_WIN
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "keyrange.h"
#include "../util/dbt.h"
namespace toku {
// create a keyrange by borrowing the left and right dbt
// pointers. no memory is copied. no checks for infinity needed.
void keyrange::create(const DBT *left, const DBT *right) {
init_empty();
m_left_key = left;
m_right_key = right;
}
// destroy the key copies. if they were never set, then destroy does nothing.
void keyrange::destroy(void) {
toku_destroy_dbt(&m_left_key_copy);
toku_destroy_dbt(&m_right_key_copy);
}
// create a keyrange by copying the keys from the given range.
void keyrange::create_copy(const keyrange &range) {
// start with an initialized, empty range
init_empty();
// optimize the case where the left and right keys are the same.
// we'd like to only have one copy of the data.
if (toku_dbt_equals(range.get_left_key(), range.get_right_key())) {
set_both_keys(range.get_left_key());
} else {
// replace our empty left and right keys with
// copies of the range's left and right keys
replace_left_key(range.get_left_key());
replace_right_key(range.get_right_key());
}
}
// extend this keyrange by choosing the leftmost and rightmost
// endpoints between this range and the given. replaced keys
// in this range are freed and inherited keys are copied.
void keyrange::extend(const comparator &cmp, const keyrange &range) {
const DBT *range_left = range.get_left_key();
const DBT *range_right = range.get_right_key();
if (cmp(range_left, get_left_key()) < 0) {
replace_left_key(range_left);
}
if (cmp(range_right, get_right_key()) > 0) {
replace_right_key(range_right);
}
}
// how much memory does this keyrange take?
// - the size of the left and right keys
// --- ignore the fact that we may have optimized the point case.
// it complicates things for little gain.
// - the size of the keyrange class itself
uint64_t keyrange::get_memory_size(void) const {
const DBT *left_key = get_left_key();
const DBT *right_key = get_right_key();
return left_key->size + right_key->size + sizeof(keyrange);
}
// classify how this range relates to the given range:
// - LESS_THAN:    this range ends before the given range begins
// - GREATER_THAN: this range begins after the given range ends
// - EQUALS:       both endpoints compare equal
// - OVERLAPS:     anything else, i.e. the ranges intersect
keyrange::comparison keyrange::compare(const comparator &cmp,
                                       const keyrange &range) const {
  if (cmp(get_right_key(), range.get_left_key()) < 0) {
    return comparison::LESS_THAN;
  }
  if (cmp(get_left_key(), range.get_right_key()) > 0) {
    return comparison::GREATER_THAN;
  }
  const bool same_left = cmp(get_left_key(), range.get_left_key()) == 0;
  if (same_left && cmp(get_right_key(), range.get_right_key()) == 0) {
    return comparison::EQUALS;
  }
  return comparison::OVERLAPS;
}
bool keyrange::overlaps(const comparator &cmp, const keyrange &range) const {
// equality is a stronger form of overlapping.
// so two ranges "overlap" if they're either equal or just overlapping.
comparison c = compare(cmp, range);
return c == comparison::EQUALS || c == comparison::OVERLAPS;
}
// build the range (-infinity, +infinity). both endpoints are the
// dbts returned by the toku_dbt_*_infinity helpers and are borrowed
// through create(), so the returned range copies no key data.
keyrange keyrange::get_infinite_range(void) {
  const DBT *negative_infinity = toku_dbt_negative_infinity();
  const DBT *positive_infinity = toku_dbt_positive_infinity();
  keyrange everything;
  everything.create(negative_infinity, positive_infinity);
  return everything;
}
// put the keyrange into its empty state: no borrowed endpoints,
// freshly initialized key copies, and not a point range.
void keyrange::init_empty(void) {
  m_point_range = false;
  toku_init_dbt(&m_left_key_copy);
  toku_init_dbt(&m_right_key_copy);
  m_left_key = m_right_key = nullptr;
}
// the left endpoint: the borrowed pointer when one is set,
// otherwise our own copy of the key.
const DBT *keyrange::get_left_key(void) const {
  return m_left_key != nullptr ? m_left_key : &m_left_key_copy;
}
// the right endpoint: the borrowed pointer when one is set,
// otherwise our own copy of the key.
const DBT *keyrange::get_right_key(void) const {
  return m_right_key != nullptr ? m_right_key : &m_right_key_copy;
}
// make both endpoints equal to the given key and mark the range as
// a point range. this is the point-range optimization: the key data
// is cloned only once (into the left copy) and the right copy is set
// up with toku_copyref_dbt to refer to it, instead of cloning twice.
// an infinite key is not cloned at all; both endpoints borrow it.
void keyrange::set_both_keys(const DBT *key) {
  m_point_range = true;
  if (toku_dbt_is_infinite(key)) {
    m_left_key = m_right_key = key;
    return;
  }
  toku_clone_dbt(&m_left_key_copy, *key);
  toku_copyref_dbt(&m_right_key_copy, m_left_key_copy);
}
// destroy the current left key. set and possibly copy the new one
void keyrange::replace_left_key(const DBT *key) {
// a little magic:
//
// if this is a point range, then the left and right keys share
// one copy of the data, and it lives in the left key copy. so
// if we're replacing the left key, move the real data to the
// right key copy instead of destroying it. now, the memory is
// owned by the right key and the left key may be replaced.
if (m_point_range) {
m_right_key_copy = m_left_key_copy;
} else {
toku_destroy_dbt(&m_left_key_copy);
}
if (toku_dbt_is_infinite(key)) {
m_left_key = key;
} else {
toku_clone_dbt(&m_left_key_copy, *key);
m_left_key = nullptr;
}
m_point_range = false;
}
// replace the right endpoint with the given key. the current right
// key copy is destroyed first. an infinite key is then borrowed by
// pointer; any other key is cloned into the right key copy and the
// borrowed pointer is cleared. the range stops being a point range.
void keyrange::replace_right_key(const DBT *key) {
  toku_destroy_dbt(&m_right_key_copy);

  const bool must_copy = !toku_dbt_is_infinite(key);
  if (must_copy) {
    toku_clone_dbt(&m_right_key_copy, *key);
    m_right_key = nullptr;
  } else {
    m_right_key = key;
  }

  m_point_range = false;
}
} /* namespace toku */
#endif // OS_WIN
#endif // ROCKSDB_LITE

@ -0,0 +1,140 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include "../ft/comparator.h"
namespace toku {
// A keyrange has a left and right key as endpoints.
//
// When a keyrange is created it owns no memory, but when it copies
// or extends another keyrange, it copies memory as necessary. This
// means it is cheap in the common case.
//
// There is no constructor or destructor: a keyrange is set up with
// create() or create_copy() and torn down with destroy().
class keyrange {
public:
// effect: constructor that borrows left and right key pointers.
// no memory is allocated or copied.
void create(const DBT *left_key, const DBT *right_key);
// effect: constructor that allocates and copies another keyrange's points.
// when both endpoints of the given range are equal, the key
// is copied only once (see set_both_keys).
void create_copy(const keyrange &range);
// effect: destroys the keyrange, freeing any allocated memory
void destroy(void);
// effect: extends the keyrange by choosing the leftmost and rightmost
// endpoints from this range and the given range.
// replaced keys in this range are freed, new keys are copied.
void extend(const comparator &cmp, const keyrange &range);
// returns: the amount of memory this keyrange takes: the sizes of both
// keys plus sizeof(keyrange). does not account for point
// optimizations or malloc overhead.
uint64_t get_memory_size(void) const;
// returns: pointer to the left key of this range
const DBT *get_left_key(void) const;
// returns: pointer to the right key of this range
const DBT *get_right_key(void) const;
// two ranges are either equal, lt, gt, or overlapping
enum comparison { EQUALS, LESS_THAN, GREATER_THAN, OVERLAPS };
// effect: compares this range to the given range
// returns: LESS_THAN if this range lies strictly to the left of the
// given range (this right key < given left key)
// GREATER_THAN if this range lies strictly to the right of the
// given range (this left key > given right key)
// EQUALS if given range has the same left and right endpoints
// OVERLAPS in every other case, i.e. the two ranges intersect
// without being identical (this includes one range
// containing the other)
comparison compare(const comparator &cmp, const keyrange &range) const;
// returns: true if the range and the given range are equal or overlapping
bool overlaps(const comparator &cmp, const keyrange &range) const;
// returns: a keyrange representing -inf, +inf
static keyrange get_infinite_range(void);
private:
// some keys should be copied, some keys should not be.
//
// to support both, we use two DBTs for copies and two pointers
// for temporaries. the access rule is:
// - if a pointer is non-null, then it represents the key.
// - otherwise the pointer is null, and the key is in the copy.
DBT m_left_key_copy;
DBT m_right_key_copy;
const DBT *m_left_key;
const DBT *m_right_key;
// set by set_both_keys(): both endpoints are the same key. for a
// non-infinite key the actual data is stored exactly once, in
// m_left_key_copy, and m_right_key_copy only refers to it. cleared
// again as soon as either endpoint is replaced.
bool m_point_range;
// effect: initializes a keyrange to be empty
void init_empty(void);
// effect: copies the given key once into the left key copy
// and sets the right key copy to share the left.
// an infinite key is borrowed by both pointers instead.
// rationale: optimization for point ranges to only do one malloc
void set_both_keys(const DBT *key);
// effect: destroys the current left key. sets and copies the new one.
// (infinite keys are borrowed rather than copied.)
void replace_left_key(const DBT *key);
// effect: destroys the current right key. sets and copies the new one.
// (infinite keys are borrowed rather than copied.)
void replace_right_key(const DBT *key);
};
} /* namespace toku */

@ -0,0 +1,534 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef ROCKSDB_LITE
#ifndef OS_WIN
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "lock_request.h"
#include "../portability/toku_race_tools.h"
#include "../portability/txn_subst.h"
#include "../util/dbt.h"
#include "locktree.h"
namespace toku {
// initialize a lock request's internals.
//
// every scalar member is given a defined value here, so the accessors
// (e.g. get_extra()) are safe to call on a request that has been
// created but not yet configured through set().
//
// mutex_factory: passed to toku_external_cond_init() to set up
// m_wait_cond.
void lock_request::create(toku_external_mutex_factory_t mutex_factory) {
  m_txnid = TXNID_NONE;
  m_conflicting_txnid = TXNID_NONE;
  m_start_time = 0;
  m_left_key = nullptr;
  m_right_key = nullptr;
  toku_init_dbt(&m_left_key_copy);
  toku_init_dbt(&m_right_key_copy);

  m_type = type::UNKNOWN;
  m_lt = nullptr;

  m_complete_r = 0;
  m_state = state::UNINITIALIZED;
  m_info = nullptr;

  // set() overwrites both of these; give them defined values until then
  // instead of leaving them indeterminate.
  m_big_txn = false;
  m_extra = nullptr;

  // psergey-todo: this condition is for interruptible wait
  // the condition variable that wait() sleeps on and that retry() and
  // kill_waiter() broadcast to.
  toku_external_cond_init(mutex_factory, &m_wait_cond);

  m_start_test_callback = nullptr;
  m_start_before_pending_test_callback = nullptr;
  m_retry_test_callback = nullptr;
}
// destroy a lock request.
void lock_request::destroy(void) {
invariant(m_state != state::PENDING);
invariant(m_state != state::DESTROYED);
m_state = state::DESTROYED;
toku_destroy_dbt(&m_left_key_copy);
toku_destroy_dbt(&m_right_key_copy);
toku_external_cond_destroy(&m_wait_cond);
}
// (re)configure the lock request. because every parameter is simply
// overwritten, a request can be reused for another lock once it is
// no longer pending. the given keys are borrowed, not copied; any
// key copies left over from an earlier use are destroyed.
void lock_request::set(locktree *lt, TXNID txnid, const DBT *left_key,
                       const DBT *right_key, lock_request::type lock_type,
                       bool big_txn, void *extra) {
  invariant(m_state != state::PENDING);

  // who is asking, for what kind of lock, and on which locktree
  m_lt = lt;
  m_txnid = txnid;
  m_type = lock_type;
  m_big_txn = big_txn;
  m_extra = extra;

  // the requested range
  m_left_key = left_key;
  m_right_key = right_key;
  toku_destroy_dbt(&m_left_key_copy);
  toku_destroy_dbt(&m_right_key_copy);

  // the pending-request bookkeeping lives in the locktree, if there is one
  if (lt != nullptr) {
    m_info = lt->get_lock_request_info();
  } else {
    m_info = nullptr;
  }

  m_state = state::INITIALIZED;
}
// get rid of any stored left and right key copies and
// replace them with copies of the given left and right key
void lock_request::copy_keys() {
if (!toku_dbt_is_infinite(m_left_key)) {
toku_clone_dbt(&m_left_key_copy, *m_left_key);
m_left_key = &m_left_key_copy;
}
if (!toku_dbt_is_infinite(m_right_key)) {
toku_clone_dbt(&m_right_key_copy, *m_right_key);
m_right_key = &m_right_key_copy;
}
}
// ask the locktree which txnids conflict with this pending request.
// the answer is written into *conflicts.
void lock_request::get_conflicts(txnid_set *conflicts) {
  invariant(m_state == state::PENDING);
  const bool is_write_request = (m_type == type::WRITE);
  m_lt->get_conflicts(is_write_request, m_txnid, m_left_key, m_right_key,
                      conflicts);
}
// extend the wait-for-graph with this request (A) and its conflicts:
//   for each transaction B that blocks A's lock request
//     if B itself has a pending lock request then
//       add the edge (A,B), and if B was not in the graph yet,
//       recurse to add B's own conflicts.
void lock_request::build_wait_graph(wfg *wait_graph,
                                    const txnid_set &conflicts) {
  const uint32_t num_conflicts = conflicts.size();
  for (uint32_t i = 0; i < num_conflicts; ++i) {
    TXNID conflicting_txnid = conflicts.get(i);
    lock_request *conflicting_request = find_lock_request(conflicting_txnid);
    invariant(conflicting_txnid != m_txnid);
    invariant(conflicting_request != this);

    // a blocker with no pending request of its own adds no edge.
    if (conflicting_request == nullptr) {
      continue;
    }

    const bool seen_before = wait_graph->node_exists(conflicting_txnid);
    wait_graph->add_edge(m_txnid, conflicting_txnid);
    if (seen_before) {
      // the graph rooted at this blocker has already been built.
      continue;
    }

    // first visit: recursively build the wait-for-graph rooted at the
    // conflicting request, given its own set of lock conflicts.
    txnid_set other_conflicts;
    other_conflicts.create();
    conflicting_request->get_conflicts(&other_conflicts);
    conflicting_request->build_wait_graph(wait_graph, other_conflicts);
    other_conflicts.destroy();
  }
}
// returns: true if the current set of lock requests contains
// a deadlock, false otherwise.
bool lock_request::deadlock_exists(const txnid_set &conflicts) {
wfg wait_graph;
wait_graph.create();
build_wait_graph(&wait_graph, conflicts);
std::function<void(TXNID)> reporter;
if (m_deadlock_cb) {
reporter = [this](TXNID a) {
lock_request *req = find_lock_request(a);
if (req) {
m_deadlock_cb(req->m_txnid, (req->m_type == lock_request::WRITE),
req->m_left_key, req->m_right_key);
}
};
}
bool deadlock = wait_graph.cycle_exists_from_txnid(m_txnid, reporter);
wait_graph.destroy();
return deadlock;
}
// try to acquire a lock described by this lock request.
//
// returns: the result of the locktree's acquire_write_lock() or
// acquire_read_lock(), except that a DB_LOCK_NOTGRANTED result is
// turned into DB_LOCK_DEADLOCK when registering this request as
// pending would create a deadlock.
//
// on DB_LOCK_NOTGRANTED the request stays in the PENDING state (to be
// resolved later by retry() / wait()). for every other result the
// request is completed with that result.
int lock_request::start(void) {
int r;
txnid_set conflicts;
conflicts.create();
if (m_type == type::WRITE) {
r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts,
m_big_txn);
} else {
invariant(m_type == type::READ);
r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts,
m_big_txn);
}
// if the lock is not granted, save it to the set of lock requests
// and check for a deadlock. if there is one, complete it as failed
if (r == DB_LOCK_NOTGRANTED) {
// the keys were only borrowed from the caller (see set()); take
// copies before the request is registered as pending.
copy_keys();
m_state = state::PENDING;
// microseconds -> milliseconds
m_start_time = toku_current_time_microsec() / 1000;
// remember one of the blockers (there may be more)
m_conflicting_txnid = conflicts.get(0);
if (m_start_before_pending_test_callback)
m_start_before_pending_test_callback();
// the pending set and the deadlock check are protected by the
// lock request info mutex.
toku_external_mutex_lock(&m_info->mutex);
insert_into_lock_requests();
if (deadlock_exists(conflicts)) {
remove_from_lock_requests();
r = DB_LOCK_DEADLOCK;
}
toku_external_mutex_unlock(&m_info->mutex);
if (m_start_test_callback) m_start_test_callback(); // test callback
}
// anything other than "still waiting" is a final answer (this
// includes the DB_LOCK_DEADLOCK set just above).
if (r != DB_LOCK_NOTGRANTED) {
complete(r);
}
conflicts.destroy();
return r;
}
// sleep on the lock request until it becomes resolved or the wait
// time has elapsed. convenience form of the full wait(): no killed
// callback and a killed-poll interval of 0.
int lock_request::wait(uint64_t wait_time_ms) {
  const uint64_t no_killed_poll_ms = 0;
  int (*const no_killed_callback)(void) = nullptr;
  return wait(wait_time_ms, no_killed_poll_ms, no_killed_callback);
}
// sleep on the lock request until it is resolved, the wait time has
// elapsed, or the killed callback asks us to stop.
//
// wait_time_ms: overall timeout, in milliseconds.
// killed_time_ms: if non-zero, each sleep lasts at most this many
// milliseconds, so killed_callback is polled at that
// interval. if zero, each sleep lasts until the
// overall deadline (or until the condition is
// signalled).
// killed_callback: optional. a non-zero return removes the request
// from the pending set and completes it with
// DB_LOCK_NOTGRANTED.
// lock_wait_callback, callback_arg: optional. if the request is still
// pending after the initial retry, the callback is
// invoked through report_waits() for the conflicts
// collected by that retry.
//
// returns: the value the request was completed with (m_complete_r);
// DB_LOCK_NOTGRANTED on timeout or kill.
int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms,
int (*killed_callback)(void),
void (*lock_wait_callback)(void *, TXNID, TXNID),
void *callback_arg) {
// all t_* values are in microseconds
uint64_t t_now = toku_current_time_microsec();
uint64_t t_start = t_now;
uint64_t t_end = t_start + wait_time_ms * 1000;
toku_external_mutex_lock(&m_info->mutex);
// check again, this time locking out other retry calls
if (m_state == state::PENDING) {
GrowableArray<TXNID> conflicts_collector;
conflicts_collector.init();
retry(&conflicts_collector);
if (m_state == state::PENDING) {
report_waits(&conflicts_collector, lock_wait_callback, callback_arg);
}
conflicts_collector.deinit();
}
while (m_state == state::PENDING) {
// check if this thread is killed
if (killed_callback && killed_callback()) {
remove_from_lock_requests();
complete(DB_LOCK_NOTGRANTED);
// complete() changed m_state, so the loop condition now fails
continue;
}
// compute the time until we should wait
uint64_t t_wait;
if (killed_time_ms == 0) {
t_wait = t_end;
} else {
t_wait = t_now + killed_time_ms * 1000;
if (t_wait > t_end) t_wait = t_end;
}
// the last argument is a relative wait, in microseconds
int r = toku_external_cond_timedwait(&m_wait_cond, &m_info->mutex,
(int64_t)(t_wait - t_now));
invariant(r == 0 || r == ETIMEDOUT);
t_now = toku_current_time_microsec();
if (m_state == state::PENDING && (t_now >= t_end)) {
m_info->counters.timeout_count += 1;
// if we're still pending and we timed out, then remove our
// request from the set of lock requests and fail.
remove_from_lock_requests();
// complete sets m_state to COMPLETE, breaking us out of the loop
complete(DB_LOCK_NOTGRANTED);
}
}
// update the wait statistics; a wait of one second or more
// (1000000 microseconds) is additionally counted as a long wait.
uint64_t t_real_end = toku_current_time_microsec();
uint64_t duration = t_real_end - t_start;
m_info->counters.wait_count += 1;
m_info->counters.wait_time += duration;
if (duration >= 1000000) {
m_info->counters.long_wait_count += 1;
m_info->counters.long_wait_time += duration;
}
toku_external_mutex_unlock(&m_info->mutex);
invariant(m_state == state::COMPLETE);
return m_complete_r;
}
// complete this lock request with the given return value
void lock_request::complete(int complete_r) {
m_complete_r = complete_r;
m_state = state::COMPLETE;
}
// accessor: the left endpoint of the requested range.
const DBT *lock_request::get_left_key(void) const {
  return m_left_key;
}
// accessor: the right endpoint of the requested range.
const DBT *lock_request::get_right_key(void) const {
  return m_right_key;
}
// accessor: the txnid this lock request was set up for.
TXNID lock_request::get_txnid(void) const {
  return m_txnid;
}
// accessor: the time recorded by start() when the request became
// pending, in milliseconds (0 if it never did since create()).
uint64_t lock_request::get_start_time(void) const {
  return m_start_time;
}
// accessor: one txnid that was found blocking this request the last
// time the lock could not be acquired.
TXNID lock_request::get_conflicting_txnid(void) const {
  const TXNID blocker = m_conflicting_txnid;
  return blocker;
}
// try once more to acquire the lock for this pending request.
//
// on success (0) the request is removed from the pending set,
// completed, and the waiting thread is woken up through m_wait_cond.
// on any other result the request stays pending; one blocker is
// remembered in m_conflicting_txnid and all (this txnid, blocker)
// pairs are appended to *conflicts_collector.
//
// returns: the result of the locktree acquire call.
int lock_request::retry(GrowableArray<TXNID> *conflicts_collector) {
  invariant(m_state == state::PENDING);

  txnid_set conflicts;
  conflicts.create();

  const bool wants_write_lock = (m_type == type::WRITE);
  const int r =
      wants_write_lock
          ? m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key,
                                     &conflicts, m_big_txn)
          : m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key,
                                    &conflicts, m_big_txn);

  if (r != 0) {
    // still blocked: record who is in the way.
    m_conflicting_txnid = conflicts.get(0);
    add_conflicts_to_waits(&conflicts, conflicts_collector);
  } else {
    // granted: leave the pending set, complete, and signal the waiter.
    remove_from_lock_requests();
    complete(r);
    if (m_retry_test_callback) {
      m_retry_test_callback();  // test callback
    }
    toku_external_cond_broadcast(&m_wait_cond);
  }

  conflicts.destroy();
  return r;
}
// retry every pending lock request of the given locktree.
//
// concurrent callers are coalesced: each caller bumps retry_want to
// obtain a generation number, and only a caller whose generation
// immediately follows retry_done performs the retries. the others
// return without retrying, since a retry pass covering their request
// has been (or is being) run by another caller.
//
// lock_wait_callback, callback_arg: optional. invoked through
// report_waits() for the conflicts collected during the retries.
// after_retry_all_test_callback: optional test hook, called right
// after the retry pass while retry_mutex is not held.
void lock_request::retry_all_lock_requests(
locktree *lt, void (*lock_wait_callback)(void *, TXNID, TXNID),
void *callback_arg, void (*after_retry_all_test_callback)(void)) {
lt_lock_request_info *info = lt->get_lock_request_info();
// if there are no pending lock requests than there is nothing to do
// the unlocked data race on pending_is_empty is OK since lock requests
// are retried after added to the pending set.
if (info->pending_is_empty) return;
// get my retry generation (post increment of retry_want)
unsigned long long my_retry_want = (info->retry_want += 1);
toku_mutex_lock(&info->retry_mutex);
GrowableArray<TXNID> conflicts_collector;
conflicts_collector.init();
// here is the group retry algorithm.
// get the latest retry_want count and use it as the generation number of
// this retry operation. if this retry generation is > the last retry
// generation, then do the lock retries. otherwise, no lock retries
// are needed.
if ((my_retry_want - 1) == info->retry_done) {
for (;;) {
if (!info->running_retry) {
info->running_retry = true;
// this pass also serves every generation requested so far
info->retry_done = info->retry_want;
// the retries themselves run without retry_mutex held
toku_mutex_unlock(&info->retry_mutex);
retry_all_lock_requests_info(info, &conflicts_collector);
if (after_retry_all_test_callback) after_retry_all_test_callback();
toku_mutex_lock(&info->retry_mutex);
info->running_retry = false;
// wake up any caller waiting below for its turn
toku_cond_broadcast(&info->retry_cv);
break;
} else {
// another thread is in the middle of a retry pass; wait for it
toku_cond_wait(&info->retry_cv, &info->retry_mutex);
}
}
}
toku_mutex_unlock(&info->retry_mutex);
// report outside of retry_mutex
report_waits(&conflicts_collector, lock_wait_callback, callback_arg);
conflicts_collector.deinit();
}
// retry every request in info's pending set while holding the lock
// request info mutex. conflicts met along the way are appended to
// *collector by the individual retry() calls.
void lock_request::retry_all_lock_requests_info(
    lt_lock_request_info *info, GrowableArray<TXNID> *collector) {
  toku_external_mutex_lock(&info->mutex);

  // walk the pending set. a request whose retry succeeds removes
  // itself from the set, so the same index then holds the next
  // request; only step forward when the retry failed.
  uint32_t i = 0;
  while (i < info->pending_lock_requests.size()) {
    lock_request *request;
    int r = info->pending_lock_requests.fetch(i, &request);
    invariant_zero(r);

    r = request->retry(collector);
    if (r != 0) {
      ++i;
    }
  }

  // future threads should only retry lock requests if some still exist
  info->should_retry_lock_requests = info->pending_lock_requests.size() > 0;

  toku_external_mutex_unlock(&info->mutex);
}
// append one (blocked txnid, blocking txnid) pair to *wait_conflicts
// for every txnid in *conflicts. the blocked txnid is always this
// request's own. the pairs are stored flat: blocked, blocking, ...
void lock_request::add_conflicts_to_waits(
    txnid_set *conflicts, GrowableArray<TXNID> *wait_conflicts) {
  const uint32_t total = conflicts->size();
  uint32_t i = 0;
  while (i < total) {
    wait_conflicts->push(m_txnid);
    wait_conflicts->push(conflicts->get(i));
    ++i;
  }
}
// hand every collected (blocked, blocking) txnid pair to the lock
// wait callback. *wait_conflicts holds the pairs flat, as written by
// add_conflicts_to_waits(). does nothing without a callback.
void lock_request::report_waits(GrowableArray<TXNID> *wait_conflicts,
                                void (*lock_wait_callback)(void *, TXNID,
                                                           TXNID),
                                void *callback_arg) {
  if (lock_wait_callback == nullptr) {
    return;
  }

  const size_t num_entries = wait_conflicts->get_size();
  size_t i = 0;
  while (i < num_entries) {
    const TXNID blocked_txnid = wait_conflicts->fetch_unchecked(i);
    const TXNID blocking_txnid = wait_conflicts->fetch_unchecked(i + 1);
    lock_wait_callback(callback_arg, blocked_txnid, blocking_txnid);
    i += 2;
  }
}
// accessor: the opaque pointer that was passed to set() as `extra`.
void *lock_request::get_extra(void) const {
  return m_extra;
}
void lock_request::kill_waiter(void) {
remove_from_lock_requests();
complete(DB_LOCK_NOTGRANTED);
toku_external_cond_broadcast(&m_wait_cond);
}
// find the first pending request of the given locktree whose `extra`
// pointer equals the given one and kill it (see kill_waiter(void)).
// at most one request is killed per call.
void lock_request::kill_waiter(locktree *lt, void *extra) {
  lt_lock_request_info *const info = lt->get_lock_request_info();

  toku_external_mutex_lock(&info->mutex);
  const uint32_t num_pending = info->pending_lock_requests.size();
  for (uint32_t i = 0; i < num_pending; ++i) {
    lock_request *request;
    const int r = info->pending_lock_requests.fetch(i, &request);
    if (r != 0 || request->get_extra() != extra) {
      continue;
    }
    request->kill_waiter();
    break;
  }
  toku_external_mutex_unlock(&info->mutex);
}
// look up the pending lock request that belongs to the given txnid.
// must hold the mutex.
// returns: the request, or nullptr if the lookup does not succeed.
lock_request *lock_request::find_lock_request(const TXNID &txnid) {
  lock_request *request = nullptr;
  const int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>(
      txnid, &request, nullptr);
  return r == 0 ? request : nullptr;
}
// add this lock request to the locktree's set of pending requests,
// at the position reported by the txnid lookup. the txnid must not
// have a pending request already. must hold the mutex.
void lock_request::insert_into_lock_requests(void) {
  uint32_t idx;
  lock_request *request;

  // the lookup is expected to fail, and tells us where to insert.
  int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>(
      m_txnid, &request, &idx);
  invariant(r == DB_NOTFOUND);

  r = m_info->pending_lock_requests.insert_at(this, idx);
  invariant_zero(r);

  // the set has at least one element now.
  m_info->pending_is_empty = false;
}
// take this lock request out of the locktree's set of pending
// requests. the request must currently be in the set. must hold the
// mutex.
void lock_request::remove_from_lock_requests(void) {
  uint32_t idx;
  lock_request *request;

  // locate our own entry by txnid; it has to be there and be us.
  int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>(
      m_txnid, &request, &idx);
  invariant_zero(r);
  invariant(request == this);

  r = m_info->pending_lock_requests.delete_at(idx);
  invariant_zero(r);

  // flag the set as empty once the last request is gone.
  if (m_info->pending_lock_requests.size() == 0) {
    m_info->pending_is_empty = true;
  }
}
// three-way comparison used for txnid lookups in the pending set:
// returns -1, 0 or 1 when the request's txnid is less than, equal
// to, or greater than the given txnid.
int lock_request::find_by_txnid(lock_request *const &request,
                                const TXNID &txnid) {
  const TXNID request_txnid = request->m_txnid;
  if (request_txnid == txnid) {
    return 0;
  }
  return request_txnid < txnid ? -1 : 1;
}
void lock_request::set_start_test_callback(void (*f)(void)) {
m_start_test_callback = f;
}
void lock_request::set_start_before_pending_test_callback(void (*f)(void)) {
m_start_before_pending_test_callback = f;
}
void lock_request::set_retry_test_callback(void (*f)(void)) {
m_retry_test_callback = f;
}
} /* namespace toku */
#endif // OS_WIN
#endif // ROCKSDB_LITE

@ -0,0 +1,238 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include "../db.h"
#include "../ft/comparator.h"
#include "../portability/toku_pthread.h"
#include "locktree.h"
#include "txnid_set.h"
#include "wfg.h"
namespace toku {
// A lock request contains the db, the key range, the lock type, and
// the transaction id that describes a potential row range lock.
//
// the typical use case is:
// - initialize a lock request
// - start to try to acquire the lock
// - do something else
// - wait for the lock request to be resolved on a timed condition
// - destroy the lock request
// a lock request is resolved when its state is no longer pending, or
// when it becomes granted, or timedout, or deadlocked. when resolved, the
// state of the lock request is changed and any waiting threads are awakened.
class lock_request {
public:
enum type { UNKNOWN, READ, WRITE };
// effect: Initializes a lock request.
void create(toku_external_mutex_factory_t mutex_factory);
// effect: Destroys a lock request.
// requires: The lock request is not pending and not already destroyed.
void destroy(void);
// effect: Resets the lock request parameters, allowing it to be reused.
// The keys are borrowed, not copied.
// requires: Lock request was already created at some point
void set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key,
type lock_type, bool big_txn, void *extra = nullptr);
// effect: Tries to acquire a lock described by this lock request.
// returns: The return code of locktree::acquire_[write,read]_lock()
// or DB_LOCK_DEADLOCK if this request would end up deadlocked.
int start(void);
// effect: Sleeps until either the request is granted or the wait time
// expires.
// returns: The return code of locktree::acquire_[write,read]_lock()
// or simply DB_LOCK_NOTGRANTED if the wait time expired.
int wait(uint64_t wait_time_ms);
// Same as above, with additional controls:
// - killed_time_ms: if non-zero, the maximum length of a single sleep,
// i.e. how often killed_callback gets polled.
// - killed_callback: if it returns non-zero, the wait is abandoned and
// the request completes with DB_LOCK_NOTGRANTED.
// - lock_wait_callback/callback_arg: called as
// lock_wait_callback(callback_arg, blocked_txnid, blocking_txnid)
// for the conflicts found if the request is still pending after an
// initial retry.
int wait(uint64_t wait_time_ms, uint64_t killed_time_ms,
int (*killed_callback)(void),
void (*lock_wait_callback)(void *, TXNID, TXNID) = nullptr,
void *callback_arg = nullptr);
// return: left end-point of the lock range
const DBT *get_left_key(void) const;
// return: right end-point of the lock range
const DBT *get_right_key(void) const;
// return: the txnid waiting for a lock
TXNID get_txnid(void) const;
// return: when this lock request started, as milliseconds from epoch
uint64_t get_start_time(void) const;
// return: which txnid is blocking this request (there may be more, though)
TXNID get_conflicting_txnid(void) const;
// effect: Retries all of the lock requests for the given locktree.
// Any lock requests successfully restarted is completed and woken
// up.
// The rest remain pending.
static void retry_all_lock_requests(
locktree *lt, void (*lock_wait_callback)(void *, TXNID, TXNID) = nullptr,
void *callback_arg = nullptr,
void (*after_retry_test_callback)(void) = nullptr);
// effect: Retries every pending request in the given info while holding
// its mutex; conflicts found are appended to the collector as
// (blocked txnid, blocking txnid) pairs.
static void retry_all_lock_requests_info(lt_lock_request_info *info,
GrowableArray<TXNID> *collector);
// Test hooks, invoked at specific points of start() and retry().
void set_start_test_callback(void (*f)(void));
void set_start_before_pending_test_callback(void (*f)(void));
void set_retry_test_callback(void (*f)(void));
// return: the opaque `extra` pointer given to set()
void *get_extra(void) const;
// effect: Removes this request from the pending set, completes it with
// DB_LOCK_NOTGRANTED and wakes up the waiter.
void kill_waiter(void);
// effect: Kills the first pending request of the given locktree whose
// `extra` pointer equals the given one, if any.
static void kill_waiter(locktree *lt, void *extra);
private:
enum state {
UNINITIALIZED,
INITIALIZED,
PENDING,
COMPLETE,
DESTROYED,
};
// The keys for a lock request are stored "unowned" in m_left_key
// and m_right_key. When the request is about to go to sleep, it
// copies the non-infinite keys into m_left_key_copy and
// m_right_key_copy and repoints m_left_key / m_right_key at those
// copies (see copy_keys()).
TXNID m_txnid;
TXNID m_conflicting_txnid;
uint64_t m_start_time;
const DBT *m_left_key;
const DBT *m_right_key;
DBT m_left_key_copy;
DBT m_right_key_copy;
// The lock request type and associated locktree
type m_type;
locktree *m_lt;
// If the lock request is in the completed state, then its
// final return value is stored in m_complete_r
int m_complete_r;
state m_state;
// The condition wait() sleeps on; broadcast when the request is
// granted by retry() or killed by kill_waiter().
toku_external_cond_t m_wait_cond;
// Passed through to locktree::acquire_[write,read]_lock()
bool m_big_txn;
// the lock request info state stored in the
// locktree that this lock request is for.
struct lt_lock_request_info *m_info;
// Opaque caller-supplied pointer; used by kill_waiter(lt, extra) to
// identify the request.
void *m_extra;
// effect: tries again to acquire the lock described by this lock request
// returns: 0 if retrying the request succeeded and is now complete
int retry(GrowableArray<TXNID> *conflict_collector);
// effect: Stores the final return value and marks the request COMPLETE.
void complete(int complete_r);
// effect: Finds another lock request by txnid.
// requires: The lock request info mutex is held
lock_request *find_lock_request(const TXNID &txnid);
// effect: Insert this lock request into the locktree's set.
// requires: the locktree's mutex is held
void insert_into_lock_requests(void);
// effect: Removes this lock request from the locktree's set.
// requires: The lock request info mutex is held
void remove_from_lock_requests(void);
// effect: Asks this request's locktree which txnids are preventing
// us from getting the lock described by this request.
// returns: conflicts is populated with the txnid's that this request
// is blocked on
void get_conflicts(txnid_set *conflicts);
// effect: Builds a wait-for-graph for this lock request and the given
// conflict set
void build_wait_graph(wfg *wait_graph, const txnid_set &conflicts);
// returns: True if this lock request is in deadlock with the given conflicts
// set
bool deadlock_exists(const txnid_set &conflicts);
// effect: Clones the non-infinite keys into the key copies and points
// m_left_key / m_right_key at the copies.
void copy_keys(void);
// Comparison function for looking up pending requests by txnid:
// returns -1, 0 or 1.
static int find_by_txnid(lock_request *const &request, const TXNID &txnid);
// Report list of conflicts to lock wait callback.
static void report_waits(GrowableArray<TXNID> *wait_conflicts,
void (*lock_wait_callback)(void *, TXNID, TXNID),
void *callback_arg);
// Append a (this txnid, conflicting txnid) pair to wait_conflicts for
// each txnid in conflicts.
void add_conflicts_to_waits(txnid_set *conflicts,
GrowableArray<TXNID> *wait_conflicts);
// Optional test hooks; nullptr when unset.
void (*m_start_test_callback)(void);
void (*m_start_before_pending_test_callback)(void);
void (*m_retry_test_callback)(void);
public:
// Optional. When set, deadlock_exists() uses it to report the
// transactions found by the cycle search: it is called with
// (txnid, is_write_request, left key, right key).
std::function<void(TXNID, bool, const DBT *, const DBT *)> m_deadlock_cb;
friend class lock_request_unit_test;
};
// PORT: lock_request is not a POD anymore due to use of toku_external_cond_t
// This is ok as the PODness is not really required: lock_request objects are
// not moved in memory or anything.
// ENSURE_POD(lock_request);
} /* namespace toku */

@ -0,0 +1,559 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include <atomic>
#include "../db.h"
#include "../ft/comparator.h"
#include "../portability/toku_external_pthread.h"
#include "../portability/toku_pthread.h"
#include "../portability/toku_time.h"
// PORT #include <ft/ft-ops.h> // just for DICTIONARY_ID..
// PORT: ft-status for LTM_STATUS:
#include "../ft/ft-status.h"
// PORT: minimal stand-in for the DICTIONARY_ID type that used to come from
// ft/ft-ops.h. The locktree manager orders and looks up its locktrees by the
// numeric dictid (see locktree_manager::find_by_dict_id).
struct DICTIONARY_ID {
// numeric identifier of the dictionary
uint64_t dictid;
};
#include "../util/omt.h"
#include "range_buffer.h"
#include "txnid_set.h"
#include "wfg.h"
namespace toku {
class locktree;
class locktree_manager;
class lock_request;
class concurrent_tree;
// Called by the manager just after a locktree is first created. `extra` is
// the on_create_extra value given to locktree_manager::get_lt(). A nonzero
// return makes get_lt() destroy the new locktree and return nullptr.
typedef int (*lt_create_cb)(locktree *lt, void *extra);
// Called by the manager just before a locktree is destroyed.
typedef void (*lt_destroy_cb)(locktree *lt);
// Called after a locktree is escalated. `extra` is the value registered with
// locktree_manager::create().
typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt,
const range_buffer &buffer, void *extra);
// Plain aggregate of lock-wait statistics. It has no constructor, so it can
// be zero-filled or brace-initialized by its users.
struct lt_counters {
  uint64_t wait_count;
  uint64_t wait_time;
  uint64_t long_wait_count;
  uint64_t long_wait_time;
  uint64_t timeout_count;

  // effect: accumulates every counter of rhs into this object, field by field
  void add(const lt_counters &rhs) {
    timeout_count += rhs.timeout_count;
    long_wait_time += rhs.long_wait_time;
    long_wait_count += rhs.long_wait_count;
    wait_time += rhs.wait_time;
    wait_count += rhs.wait_count;
  }
};
// Lock request state for some locktree.
// Each locktree embeds one of these and exposes it through
// locktree::get_lock_request_info().
struct lt_lock_request_info {
// the lock requests currently pending on this locktree
omt<lock_request *> pending_lock_requests;
// NOTE(review): presumably mirrors whether pending_lock_requests is empty so
// it can be checked without taking the mutex - confirm in lock_request.cc
std::atomic_bool pending_is_empty;
// held while walking pending_lock_requests
// (see locktree_manager::iterate_pending_lock_requests)
toku_external_mutex_t mutex;
bool should_retry_lock_requests;
// wait/timeout statistics; the manager folds these into its own totals in
// locktree_manager::release_lt
lt_counters counters;
// NOTE(review): the retry_* members and running_retry look like the
// bookkeeping for batching lock request retries - confirm in lock_request.cc
std::atomic_ullong retry_want;
unsigned long long retry_done;
toku_mutex_t retry_mutex;
toku_cond_t retry_cv;
bool running_retry;
void init(toku_external_mutex_factory_t mutex_factory);
void destroy(void);
};
// The locktree manager manages a set of locktrees, one for each open
// dictionary. Locktrees are retrieved from the manager. When they are no
// longer needed, they are released by the user.
class locktree_manager {
public:
// param: create_cb, called just after a locktree is first created.
// destroy_cb, called just before a locktree is destroyed.
// escalate_cb, called after a locktree is escalated (with extra
// param)
// mutex_factory_arg, stored by the manager and passed on to every
// locktree it creates.
void create(lt_create_cb create_cb, lt_destroy_cb destroy_cb,
lt_escalate_cb escalate_cb, void *extra,
toku_external_mutex_factory_t mutex_factory_arg);
// requires: no lock memory is in use and no locktrees remain in the map
void destroy(void);
size_t get_max_lock_memory(void);
// returns: 0 on success, or EDOM if max_lock_memory is smaller than the
// amount of lock memory currently in use (the limit is then unchanged)
int set_max_lock_memory(size_t max_lock_memory);
// effect: Get a locktree from the manager. If a locktree exists with the
// given dict_id, it is referenced and then returned. If one did not
// exist, it is created. It will use the comparator for comparing
// keys. The on_create callback (passed to
// locktree_manager::create()) will be called with the given extra
// parameter.
// returns: the locktree, or nullptr if the on_create callback returned
// nonzero for a newly created locktree
locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp,
void *on_create_extra);
void reference_lt(locktree *lt);
// effect: Releases one reference on a locktree. If the reference count
// transitions to zero, the on_destroy callback is called before it
// gets destroyed.
void release_lt(locktree *lt);
void get_status(LTM_STATUS status);
// effect: calls the iterate function on each pending lock request
// note: holds the manager's mutex
// returns: 0, or the first nonzero value returned by the callback (iteration
// stops there)
typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id,
TXNID txnid, const DBT *left_key,
const DBT *right_key,
TXNID blocking_txnid,
uint64_t start_time,
void *extra);
int iterate_pending_lock_requests(lock_request_iterate_callback cb,
void *extra);
// effect: Determines if too many locks or too much memory is being used,
// Runs escalation on the manager if so.
// param: big_txn, if the current transaction is 'big' (has spilled rollback
// logs)
// returns: 0 if there are enough resources to create a new lock, or
// TOKUDB_OUT_OF_LOCKS if there are not enough resources and lock
// escalation failed to free up enough resources for a new lock.
int check_current_lock_constraints(bool big_txn);
// returns: true if current lock memory is at least half of the maximum
bool over_big_threshold(void);
void note_mem_used(uint64_t mem_used);
void note_mem_released(uint64_t mem_freed);
// returns: true if current lock memory has reached the maximum
bool out_of_locks(void) const;
// Escalate all locktrees
void escalate_all_locktrees(void);
// Escalate a set of locktrees
// note: releases one reference on each locktree after escalating it
void escalate_locktrees(locktree **locktrees, int num_locktrees);
// effect: calls the private function run_escalation(), only ok to
// do for tests.
// rationale: to get better stress test coverage, we want a way to
// deterministically trigger lock escalation.
void run_escalation_for_test(void);
void run_escalation(void);
// Add time t to the escalator's wait time statistics
void add_escalator_wait_time(uint64_t t);
void kill_waiter(void *extra);
private:
static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024;
// tracks the maximum and the current amount of lock memory
uint64_t m_max_lock_memory;
uint64_t m_current_lock_memory;
// counters folded in from locktrees by release_lt()
struct lt_counters m_lt_counters;
// the create and destroy callbacks for the locktrees
lt_create_cb m_lt_create_callback;
lt_destroy_cb m_lt_destroy_callback;
lt_escalate_cb m_lt_escalate_callback;
void *m_lt_escalate_callback_extra;
omt<locktree *> m_locktree_map;
// handed to each locktree created by get_lt()
toku_external_mutex_factory_t mutex_factory;
// the manager's mutex protects the locktree map
toku_mutex_t m_mutex;
void mutex_lock(void);
void mutex_unlock(void);
// Manage the set of open locktrees
locktree *locktree_map_find(const DICTIONARY_ID &dict_id);
void locktree_map_put(locktree *lt);
void locktree_map_remove(locktree *lt);
// three-way comparison of lt's dictid against dict_id (-1, 0, or 1)
static int find_by_dict_id(locktree *const &lt, const DICTIONARY_ID &dict_id);
void escalator_init(void);
void escalator_destroy(void);
// statistics about lock escalation.
// m_escalation_mutex guards the updates made in escalate_locktrees() and
// add_escalator_wait_time()
toku_mutex_t m_escalation_mutex;
uint64_t m_escalation_count;
tokutime_t m_escalation_time;
uint64_t m_escalation_latest_result;
uint64_t m_wait_escalation_count;
uint64_t m_wait_escalation_time;
uint64_t m_long_wait_escalation_count;
uint64_t m_long_wait_escalation_time;
// the escalator coordinates escalation on a set of locktrees for a bunch of
// threads
class locktree_escalator {
public:
void create(void);
void destroy(void);
void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra),
void *extra);
private:
toku_mutex_t m_escalator_mutex;
toku_cond_t m_escalator_done;
bool m_escalator_running;
};
locktree_escalator m_escalator;
friend class manager_unit_test;
};
// A locktree represents the set of row locks owned by all transactions
// over an open dictionary. Read and write ranges are represented as
// a left and right key which are compared with the given comparator
//
// Locktrees are not created and destroyed by the user. Instead, they are
// referenced and released using the locktree manager.
//
// A sample workflow looks like this:
// - Create a manager.
// - Get a locktree by dictionary id from the manager.
// - Perform read/write lock acquisition on the locktree, add references to
// the locktree using the manager, release locks, release references, etc.
// - ...
// - Release the final reference to the locktree. It will be destroyed.
// - Destroy the manager.
class locktree {
public:
// effect: Creates a locktree
void create(locktree_manager *mgr, DICTIONARY_ID dict_id,
const comparator &cmp,
toku_external_mutex_factory_t mutex_factory);
void destroy(void);
// For thread-safe, external reference counting
void add_reference(void);
// requires: the reference count is > 0
// returns: the reference count, after decrementing it by one
uint32_t release_reference(void);
// returns: the current reference count
uint32_t get_reference_count(void);
// effect: Attempts to grant a read lock for the range of keys between
// [left_key, right_key].
// returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and
// populate the given conflicts set with the txnids that hold
// conflicting locks in the range. If the locktree cannot create
// more locks, return TOKUDB_OUT_OF_LOCKS.
// note: Read locks cannot be shared between txnids, as one would expect.
// This is for simplicity since read locks are rare in MySQL.
// NOTE(review): dump_callback below reports an is_shared flag and a list of
// owners, which suggests this port does support shared locks - confirm
// whether the note above is still accurate.
int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
txnid_set *conflicts, bool big_txn);
// effect: Attempts to grant a write lock for the range of keys between
// [left_key, right_key].
// returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and
// populate the given conflicts set with the txnids that hold
// conflicting locks in the range. If the locktree cannot create
// more locks, return TOKUDB_OUT_OF_LOCKS.
int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
txnid_set *conflicts, bool big_txn);
// effect: populate the conflicts set with the txnids that would prevent
// the given txnid from getting a lock on [left_key, right_key]
void get_conflicts(bool is_write_request, TXNID txnid, const DBT *left_key,
const DBT *right_key, txnid_set *conflicts);
// effect: Release all of the lock ranges represented by the range buffer for
// a txnid.
// param: all_trx_locks_hint - NOTE(review): presumably tells the locktree
// that `ranges` covers every lock held by txnid; confirm in locktree.cc
void release_locks(TXNID txnid, const range_buffer *ranges,
bool all_trx_locks_hint = false);
// effect: Runs escalation on this locktree
void escalate(lt_escalate_cb after_escalate_callback, void *extra);
// returns: The userdata associated with this locktree, or null if it has not
// been set.
void *get_userdata(void) const;
void set_userdata(void *userdata);
locktree_manager *get_manager(void) const;
void set_comparator(const comparator &cmp);
int compare(const locktree *lt) const;
DICTIONARY_ID get_dict_id() const;
// Private info struct for storing pending lock request state.
// Only to be used by lock requests. We store it here as
// something less opaque than usual to strike a tradeoff between
// abstraction and code complexity. It is still fairly abstract
// since the lock_request object is opaque
struct lt_lock_request_info *get_lock_request_info(void);
// Callback type used by dump_locks(); cdata is the caller-supplied pointer
// given to dump_locks().
typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right,
TXNID txnid, bool is_shared,
TxnidVector *owners);
void dump_locks(void *cdata, dump_callback cb);
private:
locktree_manager *m_mgr;
DICTIONARY_ID m_dict_id;
uint32_t m_reference_count;
// Since the memory referenced by this comparator is not owned by the
// locktree, the user must guarantee it will outlive the locktree.
//
// The ydb API accomplishes this by opening an ft_handle in the on_create
// callback, which will keep the underlying FT (and its descriptor) in memory
// for as long as the handle is open. The ft_handle is stored opaquely in the
// userdata pointer below. see locktree_manager::get_lt w/ on_create_extra
comparator m_cmp;
concurrent_tree *m_rangetree;
void *m_userdata;
struct lt_lock_request_info m_lock_request_info;
// psergey-todo:
// Each transaction also keeps a list of ranges it has locked.
// So, when a transaction is running in STO mode, two identical
// lists are kept: the STO lock list and transaction's owned locks
// list. Why can't we do with just one list?
// The following fields and members prefixed with "sto_" are for
// the single txnid optimization, intended to speed up the case
// when only one transaction is using the locktree. If we know
// the locktree has only one transaction, then acquiring locks
// takes O(1) work and releasing all locks takes O(1) work.
//
// How do we know that the locktree only has a single txnid?
// What do we do if it does?
//
// When a txn with txnid T requests a lock:
// - If the tree is empty, the optimization is possible. Set the single
// txnid to T, and insert the lock range into the buffer.
// - If the tree is not empty, check if the single txnid is T. If so,
// append the lock range to the buffer. Otherwise, migrate all of
// the locks in the buffer into the rangetree on behalf of txnid T,
// and invalidate the single txnid.
//
// When a txn with txnid T releases its locks:
// - If the single txnid is valid, it must be for T. Destroy the buffer.
// - If it's not valid, release locks the normal way in the rangetree.
//
// To carry out the optimization we need to record a single txnid
// and a range buffer for each locktree, each protected by the root
// lock of the locktree's rangetree. The root lock for a rangetree
// is grabbed by preparing a locked keyrange on the rangetree.
TXNID m_sto_txnid;
range_buffer m_sto_buffer;
// The single txnid optimization speeds up the case when only one
// transaction is using the locktree. But it has the potential to
// hurt the case when more than one txnid exists.
//
// There are two things we need to do to make the optimization only
// optimize the case we care about, and not hurt the general case.
//
// Bound the worst-case latency for lock migration when the
// optimization stops working:
// - Idea: Stop the optimization and migrate immediately if we notice
// the single txnid has taken many locks in the range buffer.
// - Implementation: Enforce a max size on the single txnid range buffer.
// - Analysis: Choosing the perfect max value, M, is difficult to do
// without some feedback from the field. Intuition tells us that M should
// not be so small that the optimization is worthless, and it should not
// be so big that it's unreasonable to have to wait behind a thread doing
// the work of converting M buffer locks into rangetree locks.
//
// Prevent concurrent-transaction workloads from trying the optimization
// in vain:
// - Idea: Don't even bother trying the optimization if we think the
// system is in a concurrent-transaction state.
// - Implementation: Do something even simpler than detecting whether the
// system is in a concurrent-transaction state. Just keep a "score" value
// and some threshold. If at any time the locktree is eligible for the
// optimization, only do it if the score is at this threshold. When you
// actually do the optimization but someone has to migrate locks in the buffer
// (expensive), then reset the score back to zero. Each time a txn
// releases locks, the score is incremented by 1.
// - Analysis: If you let the threshold be "C", then at most 1 / C txns will
// do the optimization in a concurrent-transaction system. Similarly, it
// takes at most C txns to start using the single txnid optimization, which
// is good when the system transitions from multithreaded to single threaded.
//
// STO_BUFFER_MAX_SIZE:
//
// We choose the max value to be 1 million since most transactions are smaller
// than 1 million and we can create a rangetree of 1 million elements in
// less than a second. So we can be pretty confident that this threshold
// enables the optimization almost always, and prevents super pathological
// latency issues for the first lock taken by a second thread.
// NOTE(review): the constant below is 50 * 1024, not 1 million; the text
// above appears to be out of date - confirm which value is intended.
//
// STO_SCORE_THRESHOLD:
//
// A simple first guess at a good value for the score threshold is 100.
// By our analysis, we'd end up doing the optimization in vain for
// around 1% of all transactions, which seems reasonable. Further,
// if the system goes single threaded, it ought to be pretty quick
// for 100 transactions to go by, so we won't have to wait long before
// we start doing the single txnid optimization again.
static const int STO_BUFFER_MAX_SIZE = 50 * 1024;
static const int STO_SCORE_THRESHOLD = 100;
int m_sto_score;
// statistics about time spent ending the STO early
uint64_t m_sto_end_early_count;
tokutime_t m_sto_end_early_time;
// effect: begins the single txnid optimization, setting m_sto_txnid
// to the given txnid.
// requires: m_sto_txnid is invalid
void sto_begin(TXNID txnid);
// effect: append a range to the sto buffer
// requires: m_sto_txnid is valid
void sto_append(const DBT *left_key, const DBT *right_key,
bool is_write_request);
// effect: ends the single txnid optimization, releasing any memory
// stored in the sto buffer, notifying the tracker, and
// invalidating m_sto_txnid.
// requires: m_sto_txnid is valid
void sto_end(void);
// params: prepared_lkr is a void * to a prepared locked keyrange. see below.
// effect: ends the single txnid optimization early, migrating buffer locks
// into the rangetree, calling sto_end(), and then setting the
// sto_score back to zero.
// requires: m_sto_txnid is valid
void sto_end_early(void *prepared_lkr);
void sto_end_early_no_accounting(void *prepared_lkr);
// params: prepared_lkr is a void * to a prepared locked keyrange. we can't
// use the real type because the compiler won't allow us to forward
// declare concurrent_tree::locked_keyrange without including
// concurrent_tree.h, which we cannot do here because it is a template
// implementation.
// requires: the prepared locked keyrange is for the locktree's rangetree
// requires: m_sto_txnid is valid
// effect: migrates each lock in the single txnid buffer into the locktree's
// rangetree, notifying the memory tracker as necessary.
void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr);
// effect: If m_sto_txnid is valid, then release the txnid's locks
// by ending the optimization.
// requires: If m_sto_txnid is valid, it is equal to the given txnid
// returns: True if locks were released for this txnid
bool sto_try_release(TXNID txnid);
// params: prepared_lkr is a void * to a prepared locked keyrange. see above.
// requires: the prepared locked keyrange is for the locktree's rangetree
// effect: If m_sto_txnid is valid and equal to the given txnid, then
// append a range onto the buffer. Otherwise, if m_sto_txnid is valid
// but not equal to this txnid, then migrate the buffer's locks
// into the rangetree and end the optimization, setting the score
// back to zero.
// returns: true if the lock was acquired for this txnid
bool sto_try_acquire(void *prepared_lkr, TXNID txnid, const DBT *left_key,
const DBT *right_key, bool is_write_request);
// Effect:
// Provides a hook for a helgrind suppression.
// Returns:
// true if m_sto_txnid is not TXNID_NONE
bool sto_txnid_is_valid_unsafe(void) const;
// Effect:
// Provides a hook for a helgrind suppression.
// Returns:
// m_sto_score
int sto_get_score_unsafe(void) const;
void remove_overlapping_locks_for_txnid(TXNID txnid, const DBT *left_key,
const DBT *right_key);
int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
const DBT *left_key, const DBT *right_key,
bool is_write_request, txnid_set *conflicts);
int acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key,
const DBT *right_key, txnid_set *conflicts);
int try_acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key,
const DBT *right_key, txnid_set *conflicts,
bool big_txn);
friend class locktree_unit_test;
friend class manager_unit_test;
friend class lock_request_unit_test;
// engine status reaches into the locktree to read some stats
friend void locktree_manager::get_status(LTM_STATUS status);
};
} /* namespace toku */

@ -0,0 +1,526 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef ROCKSDB_LITE
#ifndef OS_WIN
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include <stdlib.h>
#include <string.h>
#include "../portability/toku_pthread.h"
#include "../util/status.h"
#include "lock_request.h"
#include "locktree.h"
namespace toku {
// effect: Initializes the manager. Records the callbacks and the mutex
//         factory, resets the memory accounting and counters, and sets up
//         the locktree map, the manager mutex and the escalator.
void locktree_manager::create(lt_create_cb create_cb, lt_destroy_cb destroy_cb,
                              lt_escalate_cb escalate_cb, void *escalate_extra,
                              toku_external_mutex_factory_t mutex_factory_arg) {
  // user-supplied hooks and the factory later handed to each locktree
  m_lt_create_callback = create_cb;
  m_lt_destroy_callback = destroy_cb;
  m_lt_escalate_callback = escalate_cb;
  m_lt_escalate_callback_extra = escalate_extra;
  mutex_factory = mutex_factory_arg;

  // memory accounting starts empty, with the default ceiling
  m_current_lock_memory = 0;
  m_max_lock_memory = DEFAULT_MAX_LOCK_MEMORY;
  ZERO_STRUCT(m_lt_counters);

  m_locktree_map.create();

  ZERO_STRUCT(m_mutex);
  toku_mutex_init(manager_mutex_key, &m_mutex, nullptr);

  escalator_init();
}
// effect: Tears down the manager: the escalator first, then the locktree map
//         and the manager mutex.
// requires: all lock memory has been released and no locktree is left in
//           the map (both are checked with invariants).
void locktree_manager::destroy(void) {
  escalator_destroy();

  // nothing may still be locked or open at this point
  invariant(m_current_lock_memory == 0);
  invariant(m_locktree_map.size() == 0);

  m_locktree_map.destroy();
  toku_mutex_destroy(&m_mutex);
}
// effect: Acquires the manager's mutex (m_mutex).
void locktree_manager::mutex_lock(void) {
  toku_mutex_lock(&m_mutex);
}
// effect: Releases the manager's mutex (m_mutex).
void locktree_manager::mutex_unlock(void) {
  toku_mutex_unlock(&m_mutex);
}
// returns: the configured maximum amount of lock memory
// note: reads the value without taking the manager's mutex
size_t locktree_manager::get_max_lock_memory(void) {
  return m_max_lock_memory;
}
// effect: Sets the maximum amount of lock memory, under the manager's mutex.
// returns: 0 on success, or EDOM if the requested maximum is below the amount
//          of lock memory currently in use (the limit is left unchanged).
int locktree_manager::set_max_lock_memory(size_t max_lock_memory) {
  mutex_lock();
  const bool below_current_usage = max_lock_memory < m_current_lock_memory;
  if (!below_current_usage) {
    m_max_lock_memory = max_lock_memory;
  }
  mutex_unlock();
  return below_current_usage ? EDOM : 0;
}
// Three-way comparison used as the heaviside function for the locktree map.
// returns: -1 if lt's dictid is smaller than dict_id's, 0 if they are equal,
//          1 if it is larger.
int locktree_manager::find_by_dict_id(locktree *const &lt,
                                      const DICTIONARY_ID &dict_id) {
  const uint64_t have = lt->get_dict_id().dictid;
  const uint64_t want = dict_id.dictid;
  if (have == want) {
    return 0;
  }
  return have < want ? -1 : 1;
}
// returns: the locktree stored in the map for dict_id, or nullptr if the
//          lookup does not succeed.
locktree *locktree_manager::locktree_map_find(const DICTIONARY_ID &dict_id) {
  locktree *found;
  const int rc = m_locktree_map.find_zero<DICTIONARY_ID, find_by_dict_id>(
      dict_id, &found, nullptr);
  if (rc != 0) {
    return nullptr;
  }
  return found;
}
// effect: Inserts lt into the locktree map, keyed by its dictionary id.
// requires: the insert succeeds (checked with an invariant).
void locktree_manager::locktree_map_put(locktree *lt) {
  const DICTIONARY_ID key = lt->get_dict_id();
  const int rc =
      m_locktree_map.insert<DICTIONARY_ID, find_by_dict_id>(lt, key, nullptr);
  invariant_zero(rc);
}
// effect: Removes lt from the locktree map.
// requires: the map holds exactly this locktree under lt's dictionary id
//           (checked with invariants).
void locktree_manager::locktree_map_remove(locktree *lt) {
  uint32_t position;
  locktree *stored;
  int rc = m_locktree_map.find_zero<DICTIONARY_ID, find_by_dict_id>(
      lt->get_dict_id(), &stored, &position);
  invariant_zero(rc);
  invariant(stored == lt);

  rc = m_locktree_map.delete_at(position);
  invariant_zero(rc);
}
// effect: Returns the locktree for dict_id. An existing locktree gets one
//         more reference. Otherwise a new one is allocated, created, passed
//         to the on_create callback (if any) and put in the map.
// returns: the locktree, or nullptr if the on_create callback returned
//          nonzero (the new locktree is then destroyed and freed).
// note: the manager's mutex is held around the search and the possible
//       insert into the locktree map.
locktree *locktree_manager::get_lt(DICTIONARY_ID dict_id, const comparator &cmp,
                                   void *on_create_extra) {
  mutex_lock();

  locktree *lt = locktree_map_find(dict_id);
  if (lt != nullptr) {
    // already open: just take another reference
    reference_lt(lt);
    mutex_unlock();
    return lt;
  }

  XCALLOC(lt);
  lt->create(this, dict_id, cmp, mutex_factory);

  // new locktree created - call the on_create callback, if one was given
  const int create_rc =
      m_lt_create_callback ? m_lt_create_callback(lt, on_create_extra) : 0;
  if (create_rc == 0) {
    locktree_map_put(lt);
  } else {
    // the callback rejected the locktree: undo the creation
    lt->release_reference();
    lt->destroy();
    toku_free(lt);
    lt = nullptr;
  }

  mutex_unlock();
  return lt;
}
// effect: Adds one reference to lt.
// requires: the caller guarantees lt cannot be destroyed while the count is
//           being incremented, either by already holding a reference to lt
//           or by holding the manager's mutex.
// note: with the manager's mutex held, a transition of the reference count
//       from 0 to 1 is fine (no race), because the caller is serialized with
//       other opens and closes.
void locktree_manager::reference_lt(locktree *lt) {
  // the increment itself is a sync fetch-and-add
  lt->add_reference();
}
// effect: Releases one reference on lt. If the reference count reaches zero
//         and is still zero once the manager's mutex is held, lt is removed
//         from the map, its counters are folded into the manager's totals,
//         the on_destroy callback (if any) is called, and lt is destroyed
//         and freed.
void locktree_manager::release_lt(locktree *lt) {
  bool do_destroy = false;
  // Read the dictionary id up front: once the reference is released, lt must
  // be treated as invalid until it has been found in the map again.
  DICTIONARY_ID dict_id = lt->get_dict_id();

  // Release a reference on the locktree. If the count transitions to zero,
  // then we *may* need to do the cleanup.
  //
  // Grab the manager's mutex and look for a locktree with this locktree's
  // dictionary id. Since dictionary id's never get reused, any locktree
  // found must be the one we just released a reference on.
  //
  // At least two things could have happened since we got the mutex:
  // - Another thread gets a locktree with the same dict_id, increments
  //   the reference count. In this case, we shouldn't destroy it.
  // - Another thread gets a locktree with the same dict_id and then
  //   releases it quickly, transitioning the reference count from zero to
  //   one and back to zero. In this case, only one of us should destroy it.
  //   It doesn't matter which. We originally missed this case, see #5776.
  //
  // After 5776, the high level rule for release is described below.
  //
  // If a thread releases a locktree and notices the reference count transition
  // to zero, then that thread must immediately:
  // - assume the locktree object is invalid
  // - grab the manager's mutex
  // - search the locktree map for a locktree with the same dict_id and remove
  //   it, if it exists. the destroy may be deferred.
  // - release the manager's mutex
  //
  // This way, if many threads transition the same locktree's reference count
  // from 1 to zero and wait behind the manager's mutex, only one of them will
  // do the actual destroy and the others will happily do nothing.
  uint32_t refs = lt->release_reference();
  if (refs == 0) {
    mutex_lock();
    // lt may or may not have been destroyed already, so look it up.
    locktree *find_lt = locktree_map_find(dict_id);
    // A locktree still in the map with that dict_id must be equal to lt,
    // because dictionary ids are never reused. If its reference count is
    // zero, it's our responsibility to remove it and do the destroy.
    // Otherwise, someone still wants it.
    if (find_lt != nullptr && find_lt == lt &&
        lt->get_reference_count() == 0) {
      locktree_map_remove(lt);
      // Fold the counters into the manager's totals only when the locktree
      // actually leaves the map. The locktree's counters are cumulative and
      // are never reset, so adding them while the locktree stays alive would
      // count the same waits again when it is finally removed.
      m_lt_counters.add(lt->get_lock_request_info()->counters);
      do_destroy = true;
    }
    mutex_unlock();
  }

  // if necessary, do the destroy without holding the mutex
  if (do_destroy) {
    if (m_lt_destroy_callback) {
      m_lt_destroy_callback(lt);
    }
    lt->destroy();
    toku_free(lt);
  }
}
void locktree_manager::run_escalation(void) {
struct escalation_fn {
static void run(void *extra) {
locktree_manager *mgr = (locktree_manager *)extra;
mgr->escalate_all_locktrees();
};
};
m_escalator.run(this, escalation_fn::run, this);
}
// Test-only entry point for lock escalation: it simply forwards to
// run_escalation().
void locktree_manager::run_escalation_for_test(void) {
  run_escalation();
}
void locktree_manager::escalate_all_locktrees(void) {
uint64_t t0 = toku_current_time_microsec();
// get all locktrees
mutex_lock();
int num_locktrees = m_locktree_map.size();
locktree **locktrees = new locktree *[num_locktrees];
for (int i = 0; i < num_locktrees; i++) {
int r = m_locktree_map.fetch(i, &locktrees[i]);
invariant_zero(r);
reference_lt(locktrees[i]);
}
mutex_unlock();
// escalate them
escalate_locktrees(locktrees, num_locktrees);
delete[] locktrees;
uint64_t t1 = toku_current_time_microsec();
add_escalator_wait_time(t1 - t0);
}
// effect: Adds mem_used to the current lock memory total with a sync
//         fetch-and-add; the previous value is not needed.
void locktree_manager::note_mem_used(uint64_t mem_used) {
  (void)toku_sync_fetch_and_add(&m_current_lock_memory, mem_used);
}
// effect: Subtracts mem_released from the current lock memory total with a
//         sync fetch-and-sub.
// requires: at least mem_released bytes were accounted for before the call
//           (checked with an invariant on the previous value).
void locktree_manager::note_mem_released(uint64_t mem_released) {
  const uint64_t before =
      toku_sync_fetch_and_sub(&m_current_lock_memory, mem_released);
  invariant(before >= mem_released);
}
// returns: true once the current lock memory has reached the maximum
bool locktree_manager::out_of_locks(void) const {
  const bool still_below_limit = m_current_lock_memory < m_max_lock_memory;
  return !still_below_limit;
}
// returns: true once the current lock memory has reached half of the maximum
//          (the threshold applied to "big" transactions)
bool locktree_manager::over_big_threshold(void) {
  const uint64_t big_txn_limit = m_max_lock_memory / 2;
  return m_current_lock_memory >= big_txn_limit;
}
// effect: Calls `callback` for every pending lock request of every locktree
//         in the map.
// returns: 0 if every callback returned 0, otherwise the first nonzero value
//          (iteration stops there).
// note: the manager's mutex is held for the whole walk, and each locktree's
//       lock request info mutex is held while its requests are visited.
int locktree_manager::iterate_pending_lock_requests(
    lock_request_iterate_callback callback, void *extra) {
  mutex_lock();
  int rc = 0;
  const uint32_t tree_count = m_locktree_map.size();
  for (uint32_t tree_idx = 0; rc == 0 && tree_idx < tree_count; tree_idx++) {
    locktree *lt;
    rc = m_locktree_map.fetch(tree_idx, &lt);
    invariant_zero(rc);
    if (rc == EINVAL) {
      // Shouldn't happen, avoid compiler warning
      break;
    }

    struct lt_lock_request_info *info = lt->get_lock_request_info();
    toku_external_mutex_lock(&info->mutex);
    const uint32_t request_count = info->pending_lock_requests.size();
    for (uint32_t req_idx = 0; rc == 0 && req_idx < request_count;
         req_idx++) {
      lock_request *req;
      rc = info->pending_lock_requests.fetch(req_idx, &req);
      invariant_zero(rc);
      if (rc == EINVAL) {
        /* Shouldn't happen, avoid compiler warning */
        break;
      }
      rc = callback(lt->get_dict_id(), req->get_txnid(), req->get_left_key(),
                    req->get_right_key(), req->get_conflicting_txnid(),
                    req->get_start_time(), extra);
    }
    toku_external_mutex_unlock(&info->mutex);
  }
  mutex_unlock();
  return rc;
}
// effect: Runs escalation when lock memory is over the applicable limit:
//         half of the maximum for a big transaction, the maximum itself for
//         any transaction.
// returns: 0 if there is room for a new lock, or TOKUDB_OUT_OF_LOCKS if the
//          limit is still exceeded after escalation.
int locktree_manager::check_current_lock_constraints(bool big_txn) {
  if (big_txn && over_big_threshold()) {
    run_escalation();
    if (over_big_threshold()) {
      return TOKUDB_OUT_OF_LOCKS;
    }
  }

  if (out_of_locks()) {
    run_escalation();
    if (out_of_locks()) {
      // return an error if we're still out of locks after escalation.
      return TOKUDB_OUT_OF_LOCKS;
    }
  }

  return 0;
}
// effect: Initializes the escalation statistics mutex, zeroes every
//         escalation statistic and creates the escalator.
void locktree_manager::escalator_init(void) {
  ZERO_STRUCT(m_escalation_mutex);
  toku_mutex_init(manager_escalation_mutex_key, &m_escalation_mutex, nullptr);

  // statistics about escalation runs
  m_escalation_count = 0;
  m_escalation_time = 0;
  m_escalation_latest_result = 0;

  // statistics about time spent waiting on escalation
  m_wait_escalation_count = 0;
  m_wait_escalation_time = 0;
  m_long_wait_escalation_count = 0;
  m_long_wait_escalation_time = 0;

  m_escalator.create();
}
// effect: Destroys the escalator, then the escalation statistics mutex.
void locktree_manager::escalator_destroy(void) {
  m_escalator.destroy();
  toku_mutex_destroy(&m_escalation_mutex);
}
// Records one wait of duration `t` spent in the escalator. Waits of at least
// 1000000 time units are additionally counted as "long" waits. All counters
// are updated under m_escalation_mutex.
void locktree_manager::add_escalator_wait_time(uint64_t t) {
  const bool is_long_wait = t >= 1000000;

  toku_mutex_lock(&m_escalation_mutex);
  m_wait_escalation_count++;
  m_wait_escalation_time += t;
  if (is_long_wait) {
    m_long_wait_escalation_count++;
    m_long_wait_escalation_time += t;
  }
  toku_mutex_unlock(&m_escalation_mutex);
}
// Escalates each of the `num_locktrees` locktrees in `locktrees`, one after
// another and in place, then releases the manager's hold on each tree via
// release_lt(). Afterwards the escalation statistics are updated under
// m_escalation_mutex.
//
// This is deliberately the simple approach: it makes no attempt to shrink
// each txn's range buffer, which would need extra layering (or a callback)
// and more complicated locking.
void locktree_manager::escalate_locktrees(locktree **locktrees,
                                          int num_locktrees) {
  const tokutime_t started = toku_time_now();
  for (int idx = 0; idx < num_locktrees; idx++) {
    locktree *const lt = locktrees[idx];
    lt->escalate(m_lt_escalate_callback, m_lt_escalate_callback_extra);
    release_lt(lt);
  }
  const tokutime_t finished = toku_time_now();

  toku_mutex_lock(&m_escalation_mutex);
  m_escalation_count++;
  m_escalation_time += (finished - started);
  // remember how much lock memory was in use once this pass finished
  m_escalation_latest_result = m_current_lock_memory;
  toku_mutex_unlock(&m_escalation_mutex);
}
// Bundle of arguments describing one escalation job.
// NOTE(review): presumably handed to locktree_escalator::run() as its opaque
// `extra` pointer and unpacked by the escalation callback -- confirm against
// run_escalation(), which is outside this view.
struct escalate_args {
// the manager on whose behalf escalation runs
locktree_manager *mgr;
// array of locktrees to escalate
locktree **locktrees;
// number of entries in `locktrees` (presumably; verify against the producer)
int num_locktrees;
};
// Initializes the escalator: no escalation is running yet, and the mutex and
// condition variable used to coordinate escalating/waiting threads are set up.
void locktree_manager::locktree_escalator::create(void) {
  m_escalator_running = false;

  ZERO_STRUCT(m_escalator_mutex);
  toku_mutex_init(manager_escalator_mutex_key, &m_escalator_mutex, nullptr);
  toku_cond_init(manager_m_escalator_done_key, &m_escalator_done, nullptr);
}
// Releases the synchronization primitives created by create(): the condition
// variable first, then the mutex.
void locktree_manager::locktree_escalator::destroy(void) {
  toku_cond_destroy(&m_escalator_done);
  toku_mutex_destroy(&m_escalator_mutex);
}
// Ensures an escalation pass happens, running at most one at a time.
//
// If no escalation is in progress, the calling thread marks the escalator as
// running, drops m_escalator_mutex, calls escalate_locktrees_fun(extra), then
// retakes the mutex, clears the flag and wakes every waiter. If an escalation
// is already in progress, the caller instead blocks on m_escalator_done.
//
// In both cases the elapsed time between entry and exit is reported to
// mgr->add_escalator_wait_time().
//
// mgr:                    manager that receives the wait-time statistic
// escalate_locktrees_fun: callback that performs the escalation work
// extra:                  opaque argument forwarded to the callback
//
// NOTE(review): the wait below is a single toku_cond_wait() with no predicate
// loop. If the underlying condition variable permits spurious wakeups (POSIX
// ones do), a waiter could return before the running escalation finishes.
// Consider re-checking m_escalator_running -- TODO confirm desired semantics.
void locktree_manager::locktree_escalator::run(
locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra),
void *extra) {
// start of the interval reported as escalator wait time
uint64_t t0 = toku_current_time_microsec();
toku_mutex_lock(&m_escalator_mutex);
if (!m_escalator_running) {
// run escalation on this thread
m_escalator_running = true;
// the escalation itself runs without holding the escalator mutex
toku_mutex_unlock(&m_escalator_mutex);
escalate_locktrees_fun(extra);
toku_mutex_lock(&m_escalator_mutex);
m_escalator_running = false;
// wake up every thread that waited for this pass instead of running its own
toku_cond_broadcast(&m_escalator_done);
} else {
// another thread is escalating; wait for its broadcast
toku_cond_wait(&m_escalator_done, &m_escalator_mutex);
}
toku_mutex_unlock(&m_escalator_mutex);
uint64_t t1 = toku_current_time_microsec();
mgr->add_escalator_wait_time(t1 - t0);
}
// Fills `*statp` with a snapshot of the manager's status counters.
//
// Memory and escalation figures are copied straight from the manager's
// members. Per-locktree figures are gathered only if the manager mutex can be
// taken with a trylock, and a tree's pending-request count and wait counters
// are included only if that tree's lock-request mutex can also be taken with a
// trylock. This function never blocks on those mutexes, so the per-tree
// figures may be partial or zero.
void locktree_manager::get_status(LTM_STATUS statp) {
  ltm_status.init();

  // figures owned by the manager itself
  LTM_STATUS_VAL(LTM_SIZE_CURRENT) = m_current_lock_memory;
  LTM_STATUS_VAL(LTM_SIZE_LIMIT) = m_max_lock_memory;
  LTM_STATUS_VAL(LTM_ESCALATION_COUNT) = m_escalation_count;
  LTM_STATUS_VAL(LTM_ESCALATION_TIME) = m_escalation_time;
  LTM_STATUS_VAL(LTM_ESCALATION_LATEST_RESULT) = m_escalation_latest_result;
  LTM_STATUS_VAL(LTM_WAIT_ESCALATION_COUNT) = m_wait_escalation_count;
  LTM_STATUS_VAL(LTM_WAIT_ESCALATION_TIME) = m_wait_escalation_time;
  LTM_STATUS_VAL(LTM_LONG_WAIT_ESCALATION_COUNT) = m_long_wait_escalation_count;
  LTM_STATUS_VAL(LTM_LONG_WAIT_ESCALATION_TIME) = m_long_wait_escalation_time;

  // figures accumulated across all locktrees
  uint64_t pending_total = 0;
  uint64_t sto_eligible_total = 0;
  uint64_t sto_early_count_total = 0;
  tokutime_t sto_early_time_total = 0;
  uint32_t tree_count = 0;
  struct lt_counters counter_totals;
  ZERO_STRUCT(counter_totals);  // PORT: instead of ={}.

  if (toku_mutex_trylock(&m_mutex) == 0) {
    counter_totals = m_lt_counters;
    tree_count = m_locktree_map.size();
    for (uint32_t idx = 0; idx < tree_count; idx++) {
      locktree *lt;
      int rc = m_locktree_map.fetch(idx, &lt);
      invariant_zero(rc);
      if (rc == EINVAL) {  // Shouldn't happen, avoid compiler warning
        continue;
      }

      // request info is only sampled when its mutex is free right now
      if (toku_external_mutex_trylock(&lt->m_lock_request_info.mutex) == 0) {
        pending_total += lt->m_lock_request_info.pending_lock_requests.size();
        counter_totals.add(lt->get_lock_request_info()->counters);
        toku_external_mutex_unlock(&lt->m_lock_request_info.mutex);
      }

      if (lt->sto_txnid_is_valid_unsafe()) {
        sto_eligible_total += 1;
      }
      sto_early_count_total += lt->m_sto_end_early_count;
      sto_early_time_total += lt->m_sto_end_early_time;
    }
    mutex_unlock();
  }

  LTM_STATUS_VAL(LTM_NUM_LOCKTREES) = tree_count;
  LTM_STATUS_VAL(LTM_LOCK_REQUESTS_PENDING) = pending_total;
  LTM_STATUS_VAL(LTM_STO_NUM_ELIGIBLE) = sto_eligible_total;
  LTM_STATUS_VAL(LTM_STO_END_EARLY_COUNT) = sto_early_count_total;
  LTM_STATUS_VAL(LTM_STO_END_EARLY_TIME) = sto_early_time_total;
  LTM_STATUS_VAL(LTM_WAIT_COUNT) = counter_totals.wait_count;
  LTM_STATUS_VAL(LTM_WAIT_TIME) = counter_totals.wait_time;
  LTM_STATUS_VAL(LTM_LONG_WAIT_COUNT) = counter_totals.long_wait_count;
  LTM_STATUS_VAL(LTM_LONG_WAIT_TIME) = counter_totals.long_wait_time;
  LTM_STATUS_VAL(LTM_TIMEOUT_COUNT) = counter_totals.timeout_count;

  *statp = ltm_status;
}
void locktree_manager::kill_waiter(void *extra) {
mutex_lock();
int r = 0;
uint32_t num_locktrees = m_locktree_map.size();
for (uint32_t i = 0; i < num_locktrees; i++) {
locktree *lt;
r = m_locktree_map.fetch(i, &lt);
invariant_zero(r);
if (r) continue; // Get rid of "may be used uninitialized" warning
lock_request::kill_waiter(lt, extra);
}
mutex_unlock();
}
} /* namespace toku */
#endif // OS_WIN
#endif // ROCKSDB_LITE

@ -0,0 +1,264 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef ROCKSDB_LITE
#ifndef OS_WIN
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "range_buffer.h"
#include <string.h>
#include "../portability/memory.h"
#include "../util/dbt.h"
namespace toku {
// True when the left endpoint is flagged as negative or positive infinity.
bool range_buffer::record_header::left_is_infinite(void) const {
  if (left_neg_inf) {
    return true;
  }
  return left_pos_inf;
}
// True when the right endpoint is flagged as negative or positive infinity.
bool range_buffer::record_header::right_is_infinite(void) const {
  if (right_neg_inf) {
    return true;
  }
  return right_pos_inf;
}
// Fills in the header for a record covering [left_key, right_key].
//
// Infinity is detected by pointer identity with the infinity sentinels, and
// an infinite endpoint is recorded with a key size of 0. A null `right_key`
// denotes a point record: the right infinity flags mirror the left ones and
// the right key size is 0, since no separate right key is stored.
void range_buffer::record_header::init(const DBT *left_key,
                                       const DBT *right_key,
                                       bool is_exclusive) {
  is_exclusive_lock = is_exclusive;

  // left endpoint
  left_neg_inf = (left_key == toku_dbt_negative_infinity());
  left_pos_inf = (left_key == toku_dbt_positive_infinity());
  if (toku_dbt_is_infinite(left_key)) {
    left_key_size = 0;
  } else {
    left_key_size = left_key->size;
  }

  // right endpoint
  if (!right_key) {
    right_neg_inf = left_neg_inf;
    right_pos_inf = left_pos_inf;
    right_key_size = 0;
    return;
  }
  right_neg_inf = (right_key == toku_dbt_negative_infinity());
  right_pos_inf = (right_key == toku_dbt_positive_infinity());
  if (toku_dbt_is_infinite(right_key)) {
    right_key_size = 0;
  } else {
    right_key_size = right_key->size;
  }
}
// Returns the left endpoint of this record: one of the infinity sentinels if
// the header flags it as infinite, otherwise a pointer to the record's own
// left-key DBT.
const DBT *range_buffer::iterator::record::get_left_key(void) const {
  if (_header.left_neg_inf) {
    return toku_dbt_negative_infinity();
  }
  if (_header.left_pos_inf) {
    return toku_dbt_positive_infinity();
  }
  return &_left_key;
}
// Returns the right endpoint of this record: one of the infinity sentinels if
// the header flags it as infinite, otherwise a pointer to the record's own
// right-key DBT.
const DBT *range_buffer::iterator::record::get_right_key(void) const {
  if (_header.right_neg_inf) {
    return toku_dbt_negative_infinity();
  }
  if (_header.right_pos_inf) {
    return toku_dbt_positive_infinity();
  }
  return &_right_key;
}
// Serialized size of this record in bytes: the fixed header followed by the
// left and right key payloads whose lengths are stored in the header.
size_t range_buffer::iterator::record::size(void) const {
  const size_t payload_bytes = _header.left_key_size + _header.right_key_size;
  return sizeof(record_header) + payload_bytes;
}
// Populates this record from the serialized bytes at `buf`.
//
// The header is copied out of the buffer. The key DBTs are not copied: they
// are pointed directly into `buf`, so they stay valid only while that memory
// does. Infinite endpoints have no payload and leave the corresponding DBT
// untouched (the getters return the infinity sentinels instead).
//
// A finite right endpoint with a stored size of 0 is the point-record
// encoding, in which the right key shares the left key's bytes.
void range_buffer::iterator::record::deserialize(const char *buf) {
  size_t current = 0;

  // deserialize the header
  memcpy(&_header, buf, sizeof(record_header));
  current += sizeof(record_header);

  // deserialize the left key if necessary
  const bool left_is_finite = !_header.left_is_infinite();
  if (left_is_finite) {
    // point the left DBT's buffer into ours
    toku_fill_dbt(&_left_key, buf + current, _header.left_key_size);
    current += _header.left_key_size;
  }

  // deserialize the right key if necessary
  if (!_header.right_is_infinite()) {
    if (_header.right_key_size != 0) {
      toku_fill_dbt(&_right_key, buf + current, _header.right_key_size);
    } else if (left_is_finite) {
      // point record: the right key aliases the left key
      toku_copyref_dbt(&_right_key, _left_key);
    } else {
      // The left endpoint is infinite, so _left_key was never filled in and
      // must not be aliased. The right endpoint is a finite, zero-length
      // key: describe it as an empty DBT instead of reading an
      // uninitialized one.
      toku_fill_dbt(&_right_key, buf + current, 0);
    }
  }
}
// Default-constructs an iterator that is attached to no buffer: the chunk
// iterator is built from a null arena, and every position/size field is
// zeroed.
toku::range_buffer::iterator::iterator()
    : _ma_chunk_iterator(nullptr),
      _current_chunk_base(nullptr),
      _current_chunk_offset(0),
      _current_chunk_max(0),
      _current_rec_size(0) {
}
// Constructs an iterator over the records of `buffer`. The chunk iterator is
// attached to the buffer's arena and the iterator is positioned at the start
// of the arena's current chunk.
toku::range_buffer::iterator::iterator(const range_buffer *buffer)
    : _ma_chunk_iterator(&buffer->_arena),
      _current_chunk_base(nullptr),
      _current_chunk_offset(0),
      _current_chunk_max(0),
      _current_rec_size(0) {
  // load the base pointer and size of the first chunk
  reset_current_chunk();
}
// Re-reads the chunk the arena chunk iterator currently points at (its base
// address and size) and rewinds the read offset to the chunk's beginning.
void range_buffer::iterator::reset_current_chunk() {
  _current_chunk_offset = 0;
  _current_chunk_base = _ma_chunk_iterator.current(&_current_chunk_max);
}
// Deserializes the record at the iterator's current position into `rec` and
// remembers its size so that next() knows how far to advance.
//
// Returns false, leaving `rec` untouched, when the current offset is at or
// past the end of the current chunk. Returns true otherwise.
bool range_buffer::iterator::current(record *rec) {
  if (_current_chunk_offset >= _current_chunk_max) {
    return false;
  }
  const char *chunk_bytes = reinterpret_cast<const char *>(_current_chunk_base);
  rec->deserialize(chunk_bytes + _current_chunk_offset);
  _current_rec_size = rec->size();
  return true;
}
// Advances the iterator past the record most recently returned by current().
//
// Requires that the iterator is positioned inside the current chunk and that
// current() has recorded the size of the record being skipped. When the
// current chunk becomes exhausted and the arena has another chunk, the
// iterator moves to the beginning of that next chunk.
void range_buffer::iterator::next(void) {
  invariant(_current_chunk_offset < _current_chunk_max);
  invariant(_current_rec_size > 0);

  // step over the current record; the size of the one we land on is not
  // known until current() deserializes it, so forget the old size.
  _current_chunk_offset += _current_rec_size;
  _current_rec_size = 0;

  const bool chunk_exhausted = _current_chunk_offset >= _current_chunk_max;
  if (chunk_exhausted && _ma_chunk_iterator.more()) {
    _ma_chunk_iterator.next();
    reset_current_chunk();
  }
}
// Initializes an empty range buffer. The arena is created with an initial
// size of 0 so that buffer space is allocated lazily: a transaction that ends
// up taking no locks costs no malloc/free.
void range_buffer::create(void) {
  _num_ranges = 0;
  _arena.create(0);
}
// Appends the key range [left_key, right_key] to the buffer and bumps the
// range count. `is_write_request` is stored as the record's exclusive flag.
//
// When the two keys compare equal (per toku_dbt_equals) the range is stored
// as a point record, which keeps only one copy of the key.
//
// Key lengths are serialized into uint16_t header fields, so the largest
// length that survives the round trip is 2^16 - 1. MAX_KEY_SIZE is 2^16,
// which itself would wrap to 0 in the header and corrupt the record framing,
// so lengths must be strictly below it.
void range_buffer::append(const DBT *left_key, const DBT *right_key,
                          bool is_write_request) {
  invariant(left_key->size < MAX_KEY_SIZE);
  if (toku_dbt_equals(left_key, right_key)) {
    // if the keys are equal, then only one copy is stored.
    append_point(left_key, is_write_request);
  } else {
    invariant(right_key->size < MAX_KEY_SIZE);
    append_range(left_key, right_key, is_write_request);
  }
  _num_ranges++;
}
// True when the buffer's arena reports no memory in use.
bool range_buffer::is_empty(void) const {
  const uint64_t bytes_in_use = total_memory_size();
  return bytes_in_use == 0;
}
// Number of bytes the buffer's arena reports as being in use.
uint64_t range_buffer::total_memory_size(void) const {
  return _arena.total_size_in_use();
}
// Number of ranges appended to this buffer so far.
int range_buffer::get_num_ranges(void) const {
  return _num_ranges;
}
// Destroys the buffer by destroying the arena that holds all of its records.
void range_buffer::destroy(void) {
  _arena.destroy();
}
void range_buffer::append_range(const DBT *left_key, const DBT *right_key,
bool is_exclusive) {
size_t record_length =
sizeof(record_header) + left_key->size + right_key->size;
char *buf = reinterpret_cast<char *>(_arena.malloc_from_arena(record_length));
record_header h;
h.init(left_key, right_key, is_exclusive);
// serialize the header
memcpy(buf, &h, sizeof(record_header));
buf += sizeof(record_header);
// serialize the left key if necessary
if (!h.left_is_infinite()) {
memcpy(buf, left_key->data, left_key->size);
buf += left_key->size;
}
// serialize the right key if necessary
if (!h.right_is_infinite()) {
memcpy(buf, right_key->data, right_key->size);
}
}
void range_buffer::append_point(const DBT *key, bool is_exclusive) {
size_t record_length = sizeof(record_header) + key->size;
char *buf = reinterpret_cast<char *>(_arena.malloc_from_arena(record_length));
record_header h;
h.init(key, nullptr, is_exclusive);
// serialize the header
memcpy(buf, &h, sizeof(record_header));
buf += sizeof(record_header);
// serialize the key if necessary
if (!h.left_is_infinite()) {
memcpy(buf, key->data, key->size);
}
}
} /* namespace toku */
#endif // OS_WIN
#endif // ROCKSDB_LITE

@ -0,0 +1,177 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include <inttypes.h>
#include <stdint.h>
#include "../util/dbt.h"
#include "../util/memarena.h"
namespace toku {
// a key range buffer represents a set of key ranges that can
// be stored, iterated over, and then destroyed all at once.
// A range_buffer stores a sequence of key ranges as variable-length records
// inside a memarena. Ranges can be appended and iterated over, and the whole
// buffer is destroyed at once.
class range_buffer {
private:
// the key range buffer is a bunch of records in a row.
// each record has the following header, followed by the
// left key and right key data payload, if applicable.
// we limit keys to be 2^16, since we store lengths as 2 bytes.
// NOTE(review): 1 << 16 (65536) itself does not fit in the uint16_t length
// fields below (max 65535), so a check of the form `size <= MAX_KEY_SIZE`
// admits one length that cannot be stored -- confirm the intended bound.
static const size_t MAX_KEY_SIZE = 1 << 16;
// the header is copied byte-for-byte into and out of the buffer, so its
// field order and types define the serialized record format.
struct record_header {
// infinity flags for each endpoint; an infinite endpoint has no payload
bool left_neg_inf;
bool left_pos_inf;
bool right_pos_inf;
bool right_neg_inf;
// payload lengths in bytes; recorded as 0 for infinite endpoints, and
// right_key_size is also 0 for point records
uint16_t left_key_size;
uint16_t right_key_size;
// true if the range was appended as a write (exclusive) request
bool is_exclusive_lock;
// is the left (resp. right) endpoint negative or positive infinity?
bool left_is_infinite(void) const;
bool right_is_infinite(void) const;
// fill in the header for the given keys; a null right_key means a point
void init(const DBT *left_key, const DBT *right_key, bool is_exclusive);
};
// PORT static_assert(sizeof(record_header) == 8, "record header format is
// off");
// NOTE(review): the assert above is disabled in this port, presumably
// because the added is_exclusive_lock field makes the header larger than
// 8 bytes -- confirm.
public:
// the iterator abstracts reading over a buffer of variable length
// records one by one until there are no more left.
class iterator {
public:
// construct an iterator that is not attached to any buffer
iterator();
// construct an iterator positioned at the first record of the buffer
iterator(const range_buffer *buffer);
// a record represents the user-view of a serialized key range.
// it handles positive and negative infinity and the optimized
// point range case, where left and right points share memory.
class record {
public:
// get a read-only pointer to the left key of this record's range
const DBT *get_left_key(void) const;
// get a read-only pointer to the right key of this record's range
const DBT *get_right_key(void) const;
// how big is this record? this tells us where the next record is
size_t size(void) const;
// was this range appended as an exclusive (write) lock?
bool get_exclusive_flag() const { return _header.is_exclusive_lock; }
// populate a record header and point our DBT's
// buffers into ours if they are not infinite.
void deserialize(const char *buf);
private:
record_header _header;
DBT _left_key;
DBT _right_key;
};
// populate the given record object with the current record.
// returns false if there is no current record.
// the record's key DBTs point into the buffer's memory rather than owning
// a copy (see deserialize).
// NOTE(review): so they are presumably valid only while the underlying
// range_buffer is alive -- confirm.
bool current(record *rec);
// move the iterator to the next record in the buffer
void next(void);
private:
// reload the base/size of the chunk the chunk iterator points at and
// rewind the offset to its start
void reset_current_chunk();
// the key range buffer we are iterating over, the current
// offset in that buffer, and the size of the current record.
memarena::chunk_iterator _ma_chunk_iterator;
const void *_current_chunk_base;
size_t _current_chunk_offset;
size_t _current_chunk_max;
size_t _current_rec_size;
};
// allocate buffer space lazily instead of on creation. this way,
// no malloc/free is done if the transaction ends up taking no locks.
void create(void);
// append a left/right key range to the buffer.
// if the keys are equal, then only one copy is stored.
// is_write_request is recorded as the range's exclusive flag.
void append(const DBT *left_key, const DBT *right_key,
bool is_write_request = false);
// is this range buffer empty?
bool is_empty(void) const;
// how much memory is being used by this range buffer?
uint64_t total_memory_size(void) const;
// how many ranges are stored in this range buffer?
int get_num_ranges(void) const;
// release all memory held by the buffer
void destroy(void);
private:
// arena holding the serialized records
memarena _arena;
// number of ranges appended so far
int _num_ranges;
// append a range with two distinct endpoints to the buffer.
void append_range(const DBT *left_key, const DBT *right_key,
bool is_write_request);
// append a point to the buffer. this is the space/time saving
// optimization for key ranges where left == right.
void append_point(const DBT *key, bool is_write_request);
};
} /* namespace toku */

@ -0,0 +1,519 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef ROCKSDB_LITE
#ifndef OS_WIN
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "treenode.h"
#include "../portability/toku_race_tools.h"
namespace toku {
// Acquires this node's mutex.
// TODO: source location info might have to be pulled up one caller
// to be useful
void treenode::mutex_lock(void) {
  toku_mutex_lock(&m_mutex);
}
// Releases this node's mutex.
void treenode::mutex_unlock(void) {
  toku_mutex_unlock(&m_mutex);
}
void treenode::init(const comparator *cmp) {
m_txnid = TXNID_NONE;
m_is_root = false;
m_is_empty = true;
m_cmp = cmp;
m_is_shared = false;
m_owners = nullptr;
// use an adaptive mutex at each node since we expect the time the
// lock is held to be relatively short compared to a context switch.
// indeed, this improves performance at high thread counts considerably.
memset(&m_mutex, 0, sizeof(toku_mutex_t));
toku_pthread_mutexattr_t attr;
toku_mutexattr_init(&attr);
toku_mutexattr_settype(&attr, TOKU_MUTEX_ADAPTIVE);
toku_mutex_init(treenode_mutex_key, &m_mutex, &attr);
toku_mutexattr_destroy(&attr);
m_left_child.set(nullptr);
m_right_child.set(nullptr);
}
// Initializes this node as the (empty) root of a tree that orders keys
// with `cmp`.
void treenode::create_root(const comparator *cmp) {
  init(cmp);
  // init() leaves the node marked non-root; flip that for the root node.
  m_is_root = true;
}
// Destroys a root node. The node must be a root and must be empty. Its mutex
// is destroyed and its comparator pointer is cleared.
void treenode::destroy_root(void) {
  invariant(is_root());
  invariant(is_empty());
  m_cmp = nullptr;
  toku_mutex_destroy(&m_mutex);
}
// Stores a lock in this node: a fresh copy of `range`, the owning `txnid`
// and whether the lock is shared. The node is no longer empty afterwards.
void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid,
                                   bool is_shared) {
  // the node keeps its own newly allocated copy of the range
  m_range.create_copy(range);
  m_is_shared = is_shared;
  m_txnid = txnid;
  m_is_empty = false;
}
// Is this node the root of its tree?
bool treenode::is_root(void) {
  return m_is_root;
}
// Is this node currently holding no range?
bool treenode::is_empty(void) {
  return m_is_empty;
}
// Does `range` overlap this node's range, according to the node's comparator?
bool treenode::range_overlaps(const keyrange &range) {
  const bool overlapping = m_range.overlaps(*m_cmp, range);
  return overlapping;
}
// Allocates a new non-root node holding a copy of `range`, owned by `txnid`,
// shared or not per `is_shared`, and comparing keys with `cmp`.
// The returned node must eventually be released with treenode::free().
treenode *treenode::alloc(const comparator *cmp, const keyrange &range,
                          TXNID txnid, bool is_shared) {
  // XCALLOC declares `node` and allocates storage for it
  treenode *XCALLOC(node);

  node->init(cmp);
  node->set_range_and_txnid(range, txnid, is_shared);

  return node;
}
// Exchanges the lock payload of two nodes -- range, txnid, shared flag and
// owner set -- while leaving each node's position in the tree (children,
// mutex, root/empty flags) untouched.
void treenode::swap_in_place(treenode *node1, treenode *node2) {
  // key range
  {
    keyrange saved = node1->m_range;
    node1->m_range = node2->m_range;
    node2->m_range = saved;
  }
  // owning transaction id
  {
    TXNID saved = node1->m_txnid;
    node1->m_txnid = node2->m_txnid;
    node2->m_txnid = saved;
  }
  // shared flag
  {
    bool saved = node1->m_is_shared;
    node1->m_is_shared = node2->m_is_shared;
    node2->m_is_shared = saved;
  }
  // set of shared owners (pointer swap only)
  {
    auto saved = node1->m_owners;
    node1->m_owners = node2->m_owners;
    node2->m_owners = saved;
  }
}
// Adds `txnid` as another owner of this shared lock.
//
// While the node has a single owner, that owner lives in m_txnid. Once a
// second owner arrives, m_txnid becomes TXNID_SHARED and all owners are kept
// in the m_owners set.
//
// Returns false if `txnid` equals m_txnid (the same transaction acquiring a
// lock on the same range again), true otherwise.
bool treenode::add_shared_owner(TXNID txnid) {
  assert(m_is_shared);

  if (txnid == m_txnid) {
    return false;  // acquiring a lock on the same range by the same trx
  }

  const bool had_single_owner = (m_txnid != TXNID_SHARED);
  if (had_single_owner) {
    // switch to the multi-owner representation, keeping the current owner
    m_owners = new TxnidVector;
    m_owners->insert(m_txnid);
    m_txnid = TXNID_SHARED;
  }
  m_owners->insert(txnid);
  return true;
}
// Releases the lock held by `node`.
//
// The node's range (including any copied keys) and its owner set are always
// destroyed. A root node is then merely marked empty, because its storage
// belongs to the tree. Any other node has its mutex destroyed and its memory
// freed.
void treenode::free(treenode *node) {
  // destroy the range, freeing any copied keys
  node->m_range.destroy();

  // deleting a null owner set is a no-op
  delete node->m_owners;
  node->m_owners = nullptr;

  if (node->is_root()) {
    // the root is simply marked as empty.
    // PORT toku_mutex_assert_locked(&node->m_mutex);
    node->m_is_empty = true;
    return;
  }

  // PORT toku_mutex_assert_unlocked(&node->m_mutex);
  toku_mutex_destroy(&node->m_mutex);
  toku_free(node);
}
// Estimated depth of the subtree rooted at this node: one more than the
// larger of the two children's cached depth estimates.
uint32_t treenode::get_depth_estimate(void) const {
  const uint32_t left_depth = m_left_child.depth_est;
  const uint32_t right_depth = m_right_child.depth_est;

  uint32_t deeper_side = right_depth;
  if (left_depth > right_depth) {
    deeper_side = left_depth;
  }
  return deeper_side + 1;
}
// Walks down from this (locked) node towards `range` and returns, still
// locked, the node that is the parent of either the child overlapping `range`
// or the empty slot where `range` would go.
//
// `cmp_hint`, when non-null, is the already-known result of comparing `range`
// with this node's range and saves a comparison. `range` must not overlap
// this node itself; the caller (locked_keyrange::acquire) handles that case.
//
// Locks are handed over while descending: a child is locked (and rebalanced)
// before this node is unlocked.
treenode *treenode::find_node_with_overlapping_child(
    const keyrange &range, const keyrange::comparison *cmp_hint) {
  // decide which side to descend into, using the hint if we were given one.
  const keyrange::comparison side =
      cmp_hint ? *cmp_hint : range.compare(*m_cmp, m_range);

  treenode *child;
  if (side == keyrange::comparison::LESS_THAN) {
    child = lock_and_rebalance_left();
  } else {
    // range is guaranteed not to overlap this node, so the only other
    // possibility is that it lies entirely to the right.
    invariant(side == keyrange::comparison::GREATER_THAN);
    child = lock_and_rebalance_right();
  }

  // an empty subtree on that side means this node is the parent we want.
  if (child == nullptr) {
    return this;
  }

  // a child that overlaps the range also makes this node the answer.
  keyrange::comparison child_cmp = range.compare(*m_cmp, child->m_range);
  if (child_cmp == keyrange::comparison::EQUALS ||
      child_cmp == keyrange::comparison::OVERLAPS) {
    child->mutex_unlock();
    return this;
  }

  // otherwise keep searching below the (already locked) child. unlock this
  // node first, and pass along the comparison we just computed as the hint,
  // since we just compared range to the child's range.
  mutex_unlock();
  return child->find_node_with_overlapping_child(range, &child_cmp);
}
bool treenode::insert(const keyrange &range, TXNID txnid, bool is_shared) {
int rc = true;
// choose a child to check. if that child is null, then insert the new node
// there. otherwise recur down that child's subtree
keyrange::comparison c = range.compare(*m_cmp, m_range);
if (c == keyrange::comparison::LESS_THAN) {
treenode *left_child = lock_and_rebalance_left();
if (left_child == nullptr) {
left_child = treenode::alloc(m_cmp, range, txnid, is_shared);
m_left_child.set(left_child);
} else {
left_child->insert(range, txnid, is_shared);
left_child->mutex_unlock();
}
} else if (c == keyrange::comparison::GREATER_THAN) {
// invariant(c == keyrange::comparison::GREATER_THAN);
treenode *right_child = lock_and_rebalance_right();
if (right_child == nullptr) {
right_child = treenode::alloc(m_cmp, range, txnid, is_shared);
m_right_child.set(right_child);
} else {
right_child->insert(range, txnid, is_shared);
right_child->mutex_unlock();
}
} else if (c == keyrange::comparison::EQUALS) {
invariant(is_shared);
invariant(m_is_shared);
rc = add_shared_owner(txnid);
} else {
invariant(0);
}
return rc;
}
// Follows child pointers in one direction (right if `direction` > 0, left
// otherwise) until a node with no child on that side is reached, and returns
// that node.
//
// Each time the walk steps down, `*parent` is set to the node being left, so
// on return it names the parent of the returned node. If this node has no
// child in that direction, `*parent` is not modified. Children are locked
// while the walk passes through them and unlocked again on the way back up.
treenode *treenode::find_child_at_extreme(int direction, treenode **parent) {
  treenode *next;
  if (direction > 0) {
    next = m_right_child.get_locked();
  } else {
    next = m_left_child.get_locked();
  }

  if (!next) {
    return this;
  }

  *parent = this;
  treenode *extreme = next->find_child_at_extreme(direction, parent);
  next->mutex_unlock();
  return extreme;
}
// Finds the leftmost node of this subtree; see find_child_at_extreme() for
// how `*parent` is updated.
treenode *treenode::find_leftmost_child(treenode **parent) {
  const int towards_left = -1;
  return find_child_at_extreme(towards_left, parent);
}
// Finds the rightmost node of this subtree; see find_child_at_extreme() for
// how `*parent` is updated.
treenode *treenode::find_rightmost_child(treenode **parent) {
  const int towards_right = 1;
  return find_child_at_extreme(towards_right, parent);
}
// Removes the lock stored in this node, which is the root of some subtree.
//
// A node without children is simply freed (a tree root is only marked empty
// by treenode::free) and nullptr is returned. Otherwise the node's in-order
// predecessor (rightmost node of the left subtree) or, if there is no left
// subtree, its in-order successor (leftmost node of the right subtree) is
// detached from the tree, its contents are swapped into this node, and the
// detached node is freed. In that case `this` is returned.
//
// returns: the node now rooting this subtree, or nullptr if it became empty
treenode *treenode::remove_root_of_subtree() {
// if this node has no children, just free it and return null
if (m_left_child.ptr == nullptr && m_right_child.ptr == nullptr) {
// treenode::free requires that non-root nodes are unlocked
if (!is_root()) {
mutex_unlock();
}
treenode::free(this);
return nullptr;
}
// we have a child, so get either the in-order successor or
// predecessor of this node to be our replacement.
// replacement_parent is updated by the find functions as
// they recur down the tree, so initialize it to this.
treenode *child, *replacement;
treenode *replacement_parent = this;
if (m_left_child.ptr != nullptr) {
// predecessor: the rightmost node of the left subtree
child = m_left_child.get_locked();
replacement = child->find_rightmost_child(&replacement_parent);
invariant(replacement == child || replacement_parent != this);
// detach the replacement from its parent
// (the replacement has no right child, so its left subtree takes its place)
if (replacement_parent == this) {
m_left_child = replacement->m_left_child;
} else {
replacement_parent->m_right_child = replacement->m_left_child;
}
} else {
// successor: the leftmost node of the right subtree
child = m_right_child.get_locked();
replacement = child->find_leftmost_child(&replacement_parent);
invariant(replacement == child || replacement_parent != this);
// detach the replacement from its parent
// (the replacement has no left child, so its right subtree takes its place)
if (replacement_parent == this) {
m_right_child = replacement->m_right_child;
} else {
replacement_parent->m_left_child = replacement->m_right_child;
}
}
child->mutex_unlock();
// swap in place with the detached replacement, then destroy it
// (after the swap the replacement node carries this node's old lock)
treenode::swap_in_place(replacement, this);
treenode::free(replacement);
return this;
}
void treenode::recursive_remove(void) {
treenode *left = m_left_child.ptr;
if (left) {
left->recursive_remove();
}
m_left_child.set(nullptr);
treenode *right = m_right_child.ptr;
if (right) {
right->recursive_remove();
}
m_right_child.set(nullptr);
// we do not take locks on the way down, so we know non-root nodes
// are unlocked here and the caller is required to pass a locked
// root, so this free is correct.
treenode::free(this);
}
// Removes `txnid` from the owners of this shared lock. The lock must have
// more than one owner on entry and at least one afterwards.
//
// When exactly one owner remains, the node goes back to the single-owner
// representation: the remaining owner moves into m_txnid and the owner set
// is deleted.
void treenode::remove_shared_owner(TXNID txnid) {
  assert(m_owners->size() > 1);
  m_owners->erase(txnid);
  assert(m_owners->size() > 0);

  if (m_owners->size() != 1) {
    return;
  }

  /* just one owner left: move it to m_txnid */
  m_txnid = *m_owners->begin();
  delete m_owners;
  m_owners = nullptr;
}
// Removes the lock on `range` held by `txnid` from the subtree rooted at this
// (locked) node. The range must exist in the subtree.
//
// When this node holds the range: if the lock has multiple owners and a
// specific `txnid` (not TXNID_ANY) was given, only that owner is dropped.
// Otherwise the node itself is removed. When the range lies to one side, the
// removal recurses into that child and the child pointer is updated with
// whatever now roots that side.
//
// Returns the node that roots this subtree afterwards, or nullptr if the
// subtree became empty.
treenode *treenode::remove(const keyrange &range, TXNID txnid) {
  const keyrange::comparison where = range.compare(*m_cmp, m_range);
  switch (where) {
    case keyrange::comparison::EQUALS: {
      // drop just one owner if the lock is shared by several transactions,
      // otherwise remove the whole node.
      const bool drop_one_owner = txnid != TXNID_ANY && has_multiple_owners();
      if (!drop_one_owner) {
        return remove_root_of_subtree();
      }
      remove_shared_owner(txnid);
      return this;
    }
    case keyrange::comparison::LESS_THAN: {
      treenode *subtree = m_left_child.get_locked();
      invariant_notnull(subtree);
      subtree = subtree->remove(range, txnid);
      // unlock the new subtree root if there still is one.
      // regardless, set the left child pointer
      if (subtree != nullptr) {
        subtree->mutex_unlock();
      }
      m_left_child.set(subtree);
      break;
    }
    case keyrange::comparison::GREATER_THAN: {
      treenode *subtree = m_right_child.get_locked();
      invariant_notnull(subtree);
      subtree = subtree->remove(range, txnid);
      // unlock the new subtree root if there still is one.
      // regardless, set the right child pointer
      if (subtree != nullptr) {
        subtree->mutex_unlock();
      }
      m_right_child.set(subtree);
      break;
    }
    case keyrange::comparison::OVERLAPS:
      // shouldn't be overlapping, since the tree is
      // non-overlapping and this range must exist
      abort();
  }
  return this;
}
// True when there is a left child and the left side's depth estimate exceeds
// the right side's by more than `threshold`.
bool treenode::left_imbalanced(int threshold) const {
  if (m_left_child.ptr == nullptr) {
    return false;
  }
  uint32_t depth_on_left = m_left_child.depth_est;
  uint32_t depth_on_right = m_right_child.depth_est;
  return depth_on_left > threshold + depth_on_right;
}
// True when there is a right child and the right side's depth estimate
// exceeds the left side's by more than `threshold`.
bool treenode::right_imbalanced(int threshold) const {
  if (m_right_child.ptr == nullptr) {
    return false;
  }
  uint32_t depth_on_left = m_left_child.depth_est;
  uint32_t depth_on_right = m_right_child.depth_est;
  return depth_on_right > threshold + depth_on_left;
}
// effect: rebalances the subtree rooted at this node
// using AVL style O(1) rotations. unlocks this
// node if it is not the new root of the subtree.
// requires: node is locked by this thread, children are not
// returns: locked root node of the rebalanced tree
//
// A rotation is done only when one side's depth estimate exceeds the other's
// by more than IMBALANCE_THRESHOLD. If the heavy child leans the opposite
// way (by any amount), a double rotation makes the grandchild the new root.
// Otherwise a single rotation makes the child the new root.
treenode *treenode::maybe_rebalance(void) {
// if we end up not rotating at all, the new root is this
treenode *new_root = this;
treenode *child = nullptr;
if (left_imbalanced(IMBALANCE_THRESHOLD)) {
child = m_left_child.get_locked();
if (child->right_imbalanced(0)) {
// left-right case: the left child's right child becomes the new root,
// with the old left child on its left and this node on its right
treenode *grandchild = child->m_right_child.get_locked();
child->m_right_child = grandchild->m_left_child;
grandchild->m_left_child.set(child);
m_left_child = grandchild->m_right_child;
grandchild->m_right_child.set(this);
new_root = grandchild;
} else {
// left-left case: the left child becomes the new root and this node
// becomes its right child
m_left_child = child->m_right_child;
child->m_right_child.set(this);
new_root = child;
}
} else if (right_imbalanced(IMBALANCE_THRESHOLD)) {
child = m_right_child.get_locked();
if (child->left_imbalanced(0)) {
// right-left case: the right child's left child becomes the new root,
// with this node on its left and the old right child on its right
treenode *grandchild = child->m_left_child.get_locked();
child->m_left_child = grandchild->m_right_child;
grandchild->m_right_child.set(child);
m_right_child = grandchild->m_left_child;
grandchild->m_left_child.set(this);
new_root = grandchild;
} else {
// right-right case: the right child becomes the new root and this node
// becomes its left child
m_right_child = child->m_left_child;
child->m_left_child.set(this);
new_root = child;
}
}
// up to three nodes may be locked.
// - this
// - child
// - grandchild (but if it is locked, its the new root)
//
// one of them is the new root. we unlock everything except the new root.
// NOTE(review): the TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO calls presumably
// tell the race-detection tooling to forget the lock order it recorded for
// these mutexes, since a rotation changes which node is the parent -- confirm
// in toku_race_tools.h.
if (child && child != new_root) {
TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&child->m_mutex);
child->mutex_unlock();
}
if (this != new_root) {
TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&m_mutex);
mutex_unlock();
}
TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&new_root->m_mutex);
return new_root;
}
// effect: locks the left child (if any), gives it a chance to rebalance, and
//         re-links whichever node ends up as the root of that subtree.
// returns: the locked root of the left subtree, or null if there is none
treenode *treenode::lock_and_rebalance_left(void) {
  treenode *const locked_left = m_left_child.get_locked();
  if (locked_left == nullptr) {
    return nullptr;
  }
  // maybe_rebalance() hands back a locked node; it may differ from the
  // node we locked when a rotation took place.
  treenode *const subtree_root = locked_left->maybe_rebalance();
  m_left_child.set(subtree_root);
  return subtree_root;
}
// effect: locks the right child (if any), gives it a chance to rebalance, and
//         re-links whichever node ends up as the root of that subtree.
// returns: the locked root of the right subtree, or null if there is none
treenode *treenode::lock_and_rebalance_right(void) {
  treenode *const locked_right = m_right_child.get_locked();
  if (locked_right == nullptr) {
    return nullptr;
  }
  // maybe_rebalance() hands back a locked node; it may differ from the
  // node we locked when a rotation took place.
  treenode *const subtree_root = locked_right->maybe_rebalance();
  m_right_child.set(subtree_root);
  return subtree_root;
}
// effect: points this child slot at the given node (possibly null) and
//         refreshes the cached depth estimate; an empty slot has depth 0.
void treenode::child_ptr::set(treenode *node) {
  ptr = node;
  if (node != nullptr) {
    depth_est = node->get_depth_estimate();
  } else {
    depth_est = 0;
  }
}
// effect: locks the child, if there is one, and refreshes the cached depth
//         estimate while holding the child's mutex.
// returns: the locked child, or null when the slot is empty
treenode *treenode::child_ptr::get_locked(void) {
  if (ptr == nullptr) {
    return nullptr;
  }
  ptr->mutex_lock();
  depth_est = ptr->get_depth_estimate();
  return ptr;
}
} /* namespace toku */
#endif // OS_WIN
#endif // ROCKSDB_LITE

@ -0,0 +1,301 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include <string.h>
#include "../ft/comparator.h"
#include "../portability/memory.h"
#include "../portability/toku_pthread.h"
// PORT: we need LTM_STATUS
#include "../ft/ft-status.h"
#include "../portability/txn_subst.h"
#include "keyrange.h"
namespace toku {
// a node in a tree with its own mutex
// - range is the "key" of this node
// - txnid is the single txnid associated with this node, or TXNID_SHARED
// when the node has several owners (see m_txnid / m_owners below)
// - left and right children may be null
//
// to build a tree on top of this abstraction, the user:
// - provides memory for a root node, initializes it via create_root()
// - performs tree operations on the root node. memory management
// below the root node is handled by the abstraction, not the user.
// this pattern:
// - guarantees a root node always exists.
// - does not allow for rebalances on the root node
class treenode {
public:
// every treenode function has some common requirements:
// - node is locked and children are never locked
// - node may be unlocked if no other thread has visibility
// effect: create the root node
void create_root(const comparator *cmp);
// effect: destroys the root node
void destroy_root(void);
// effect: sets the txnid and copies the given range for this node
void set_range_and_txnid(const keyrange &range, TXNID txnid, bool is_shared);
// returns: true iff this node is marked as empty
bool is_empty(void);
// returns: true if this is the root node
// NOTE(review): this comment used to say "denoted by a null parent", but the
// class declares no parent pointer, only the m_is_root flag -- confirm
// against the implementation.
bool is_root(void);
// returns: true if the given range overlaps with this node's range
bool range_overlaps(const keyrange &range);
// effect: locks the node
void mutex_lock(void);
// effect: unlocks the node
void mutex_unlock(void);
// return: node whose child overlaps, or a child that is empty
// and would contain range if it existed
// given: if cmp_hint is non-null, then it is a precomputed
// comparison of this node's range to the given range.
treenode *find_node_with_overlapping_child(
const keyrange &range, const keyrange::comparison *cmp_hint);
// effect: performs an in-order traversal of the ranges that overlap the
// given range, calling function->fn() on each node that does
// requires: function->fn is callable as
// bool fn(const keyrange &range, TXNID txnid, bool is_shared,
// TxnidVector *owners)
// i.e. with this node's m_range, m_txnid, m_is_shared and m_owners
// requires: fn returns true to keep iterating, false to stop iterating
// requires: fn does not attempt to use any ranges read out by value
// after removing a node with an overlapping range from the tree.
// locking: each child is locked only while it is being visited and is
// unlocked again before this function moves on or returns.
// NOTE(review): this function returns void, so when fn returns false
// only the invocation that called fn stops early; the callers further
// up the recursion cannot observe the stop request and keep going.
template <class F>
void traverse_overlaps(const keyrange &range, F *function) {
keyrange::comparison c = range.compare(*m_cmp, m_range);
if (c == keyrange::comparison::EQUALS) {
// Doesn't matter if fn wants to keep going, there
// is nothing left, so return.
function->fn(m_range, m_txnid, m_is_shared, m_owners);
return;
}
treenode *left = m_left_child.get_locked();
if (left) {
if (c != keyrange::comparison::GREATER_THAN) {
// Target range is less than this node, or it overlaps this
// node. There may be something on the left.
left->traverse_overlaps(range, function);
}
left->mutex_unlock();
}
if (c == keyrange::comparison::OVERLAPS) {
bool keep_going = function->fn(m_range, m_txnid, m_is_shared, m_owners);
if (!keep_going) {
// the left child is already unlocked and the right child has not
// been locked yet, so no child lock is held on this early return.
return;
}
}
treenode *right = m_right_child.get_locked();
if (right) {
if (c != keyrange::comparison::LESS_THAN) {
// Target range is greater than this node, or it overlaps this
// node. There may be something on the right.
right->traverse_overlaps(range, function);
}
right->mutex_unlock();
}
}
// effect: inserts the given range and txnid into a subtree, recursively
// requires: range does not overlap with any node below the subtree
// returns: NOTE(review): the meaning of the bool result is not documented
// here -- see the definition.
bool insert(const keyrange &range, TXNID txnid, bool is_shared);
// effect: removes the given range from the subtree
// requires: range exists in the subtree
// returns: the root of the resulting subtree
treenode *remove(const keyrange &range, TXNID txnid);
// effect: removes this node and all of its children, recursively
// requires: every node at and below this node is unlocked
void recursive_remove(void);
private:
// the child_ptr is a light abstraction for the locking of
// a child and the maintenance of its depth estimate.
struct child_ptr {
// set the child pointer
void set(treenode *node);
// get and lock this child if it exists
treenode *get_locked(void);
treenode *ptr;
uint32_t depth_est;
};
// the balance factor at which a node is considered imbalanced
static const int32_t IMBALANCE_THRESHOLD = 2;
// node-level mutex
toku_mutex_t m_mutex;
// the range and txnid for this node. the range contains a copy
// of the keys originally inserted into the tree. nodes may
// swap ranges. but at the end of the day, when a node is
// destroyed, it frees the memory associated with whatever range
// it has at the time of destruction.
keyrange m_range;
// effect: presumably drops txnid from this node's owners -- the definition
// is outside this header, confirm there.
void remove_shared_owner(TXNID txnid);
// returns: true iff m_txnid holds the TXNID_SHARED marker
bool has_multiple_owners() { return (m_txnid == TXNID_SHARED); }
private:
// Owner transaction id.
// A value of TXNID_SHARED means this node has multiple owners
TXNID m_txnid;
// If true, this lock is a non-exclusive lock, and it can have either
// one or several owners.
bool m_is_shared;
// List of the owners, or nullptr if there's just one owner.
TxnidVector *m_owners;
// two child pointers
child_ptr m_left_child;
child_ptr m_right_child;
// comparator for ranges
// psergey-todo: Is there any sense to store the comparator in each tree
// node?
const comparator *m_cmp;
// marked for the root node. the root node is never free()'d
// when removed, but instead marked as empty.
bool m_is_root;
// marked for an empty node. only valid for the root.
bool m_is_empty;
// effect: initializes an empty node with the given comparator
void init(const comparator *cmp);
// requires: this is a shared node (m_is_shared==true)
// effect: another transaction is added as an owner.
// returns: true <=> added another owner
// false <=> this transaction is already an owner
bool add_shared_owner(TXNID txnid);
// requires: *parent is initialized to something meaningful.
// requires: subtree is non-empty
// returns: the leftmost child of the given subtree
// returns: a pointer to the parent of said child in *parent, only
// if this function recurred, otherwise it is untouched.
treenode *find_leftmost_child(treenode **parent);
// requires: *parent is initialized to something meaningful.
// requires: subtree is non-empty
// returns: the rightmost child of the given subtree
// returns: a pointer to the parent of said child in *parent, only
// if this function recurred, otherwise it is untouched.
treenode *find_rightmost_child(treenode **parent);
// effect: remove the root of this subtree, destroying the old root
// returns: the new root of the subtree
treenode *remove_root_of_subtree(void);
// requires: subtree is non-empty, direction is not 0
// returns: the child of the subtree at either the left or rightmost extreme
treenode *find_child_at_extreme(int direction, treenode **parent);
// effect: retrieves and possibly rebalances the left child
// returns: a locked left child, if it exists
treenode *lock_and_rebalance_left(void);
// effect: retrieves and possibly rebalances the right child
// returns: a locked right child, if it exists
treenode *lock_and_rebalance_right(void);
// returns: the estimated depth of this subtree
uint32_t get_depth_estimate(void) const;
// returns: true iff left subtree depth is sufficiently greater than the
// right, i.e. it exceeds the right depth by more than threshold
bool left_imbalanced(int threshold) const;
// returns: true iff right subtree depth is sufficiently greater than the left
bool right_imbalanced(int threshold) const;
// effect: performs an O(1) rebalance, which will "heal" an imbalance by at
// most 1.
// effect: if the new root is not this node, then this node is unlocked.
// returns: locked node representing the new root of the rebalanced subtree
treenode *maybe_rebalance(void);
// returns: allocated treenode populated with a copy of the range and txnid
static treenode *alloc(const comparator *cmp, const keyrange &range,
TXNID txnid, bool is_shared);
// requires: node is a locked root node, or an unlocked non-root node
static void free(treenode *node);
// effect: swaps the range/txnid pairs for node1 and node2.
static void swap_in_place(treenode *node1, treenode *node2);
friend class concurrent_tree_unit_test;
};
} /* namespace toku */

@ -0,0 +1,119 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef ROCKSDB_LITE
#ifndef OS_WIN
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "txnid_set.h"
#include "../db.h"
namespace toku {
// Three-way comparison of two transaction ids: negative when txnid_a is
// smaller, zero when they are equal, positive when txnid_a is larger.
// Used as the comparison callback for the omt lookups in this file.
int find_by_txnid(const TXNID &txnid_a, const TXNID &txnid_b);
int find_by_txnid(const TXNID &txnid_a, const TXNID &txnid_b) {
  if (txnid_a == txnid_b) {
    return 0;
  }
  return txnid_a < txnid_b ? -1 : 1;
}
// Set up an empty set. The backing omt is created without an array because
// many txnid sets are created and never receive a single element.
void txnid_set::create(void) {
  m_txnids.create_no_array();
}
// Release the omt that backs the set.
void txnid_set::destroy(void) {
  m_txnids.destroy();
}
// Membership test: true when the omt lookup for txnid succeeds (returns 0),
// false for any other lookup result.
bool txnid_set::contains(TXNID txnid) const {
  // The lookup writes the matching element here; only the status is used.
  TXNID matched_txnid;
  const int status =
      m_txnids.find_zero<TXNID, find_by_txnid>(txnid, &matched_txnid, nullptr);
  return status == 0;
}
// Insert txnid into the set. An id that is already present is tolerated:
// the insert may report DB_KEYEXIST, which is accepted alongside success.
void txnid_set::add(TXNID txnid) {
  const int r = m_txnids.insert<TXNID, find_by_txnid>(txnid, txnid, nullptr);
  invariant(r == 0 || r == DB_KEYEXIST);
}
// Remove txnid from the set. Removing an id that is not a member is a no-op.
void txnid_set::remove(TXNID txnid) {
  uint32_t position;
  int r = m_txnids.find_zero<TXNID, find_by_txnid>(txnid, nullptr, &position);
  if (r != 0) {
    // lookup did not succeed: nothing to delete
    return;
  }
  r = m_txnids.delete_at(position);
  invariant_zero(r);
}
// Number of txnids currently stored in the set.
uint32_t txnid_set::size(void) const {
  return m_txnids.size();
}
// Fetch the element stored at index i of the backing omt (the "i'th" id of
// the set when it is viewed in sorted order).
TXNID txnid_set::get(uint32_t i) const {
  TXNID fetched_txnid;
  const int r = m_txnids.fetch(i, &fetched_txnid);
  if (r == EINVAL) {
    /* Shouldn't happen, avoid compiler warning */
    return TXNID_NONE;
  }
  invariant_zero(r);
  return fetched_txnid;
}
} /* namespace toku */
#endif // OS_WIN
#endif // ROCKSDB_LITE

@ -0,0 +1,91 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include "../portability/txn_subst.h"
#include "../util/omt.h"
namespace toku {
// A set of transaction ids (TXNID), stored in a toku::omt<TXNID>.
// There is no constructor or destructor: users call create() before use and
// destroy() when done.
class txnid_set {
public:
// effect: Creates an empty set. Does not malloc space for
// any entries yet. That is done lazily on add().
void create(void);
// effect: Destroy the set's internals.
void destroy(void);
// returns: True if the given txnid is a member of the set.
bool contains(TXNID id) const;
// effect: Adds a given txnid to the set if it did not exist
void add(TXNID txnid);
// effect: Deletes a txnid from the set if it exists.
void remove(TXNID txnid);
// returns: Size of the set
uint32_t size(void) const;
// returns: The "i'th" id in the set, as if it were sorted.
TXNID get(uint32_t i) const;
private:
// the backing container for the ids
toku::omt<TXNID> m_txnids;
// gives the unit test access to m_txnids
friend class txnid_set_unit_test;
};
// NOTE(review): ENSURE_POD is defined elsewhere; presumably a compile-time
// check that the class stays plain-old-data -- confirm before adding members
// with constructors or virtual functions.
ENSURE_POD(txnid_set);
} /* namespace toku */

@ -0,0 +1,212 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef ROCKSDB_LITE
#ifndef OS_WIN
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "../db.h"
#include "../portability/memory.h"
// PORT #include <toku_assert.h>
#include <memory.h>
#include <string.h>
#include "txnid_set.h"
#include "wfg.h"
namespace toku {
// Create a lock request graph: sets up the (initially empty) node container.
void wfg::create(void) {
  m_nodes.create();
}
// Destroy the internals of the lock request graph
void wfg::destroy(void) {
uint32_t n_nodes = m_nodes.size();
for (uint32_t i = 0; i < n_nodes; i++) {
node *n;
int r = m_nodes.fetch(i, &n);
invariant_zero(r);
invariant_notnull(n);
if (r) continue; // Get rid of "may be used uninitialized" warning
node::free(n);
}
m_nodes.destroy();
}
// Add the directed edge a_txnid -> b_txnid. Both endpoints are looked up and
// created on demand (first a, then b); the edge is recorded in a's edge set.
void wfg::add_edge(TXNID a_txnid, TXNID b_txnid) {
  node *const origin = find_create_node(a_txnid);
  node *const destination = find_create_node(b_txnid);
  origin->edges.add(destination->txnid);
}
// True when the graph holds a node for the given transaction id,
// false otherwise.
bool wfg::node_exists(TXNID txnid) {
  return find_node(txnid) != nullptr;
}
// Depth-first search for a path from head back to target.
// - head is flagged as visited for the duration of the call so the search
//   does not walk back into a node that is already on the current path;
//   the flag is cleared again before returning.
// - the search stops at the first edge that closes a cycle.
// - if a reporter was supplied, it is called with the txnid of each edge on
//   the cycle as the recursion unwinds (innermost edge first).
// Returns true iff such a cycle was found.
bool wfg::cycle_exists_from_node(node *target, node *head,
                                 std::function<void(TXNID)> reporter) {
  head->visited = true;
  bool found = false;
  const uint32_t edge_count = head->edges.size();
  for (uint32_t pos = 0; pos < edge_count; pos++) {
    const TXNID next_id = head->edges.get(pos);
    if (next_id == target->txnid) {
      // this edge leads straight back to the target
      found = true;
    } else {
      node *const next = find_node(next_id);
      if (next != nullptr && !next->visited) {
        found = cycle_exists_from_node(target, next, reporter);
      }
    }
    if (found) {
      if (reporter) reporter(next_id);
      break;
    }
  }
  head->visited = false;
  return found;
}
// Return true if the graph contains a cycle that starts and ends at the node
// of the given transaction id. Return false otherwise, including when the
// graph has no node for that id.
bool wfg::cycle_exists_from_txnid(TXNID txnid,
                                  std::function<void(TXNID)> reporter) {
  node *const start = find_node(txnid);
  if (start == nullptr) {
    return false;
  }
  // the start node is both the search origin and the node to get back to
  return cycle_exists_from_node(start, start, reporter);
}
// Call fn(txnid, extra) for the nodes of the graph in container order.
// The walk ends once every node has been visited, or as soon as fn returns
// non-zero.
void wfg::apply_nodes(int (*fn)(TXNID id, void *extra), void *extra) {
  const uint32_t total = m_nodes.size();
  for (uint32_t slot = 0; slot < total; slot++) {
    node *n;
    const int r = m_nodes.fetch(slot, &n);
    invariant_zero(r);
    if (r != 0) {
      // Get rid of "may be used uninitialized" warning
      break;
    }
    if (fn(n->txnid, extra) != 0) {
      break;
    }
  }
}
// Call fn(txnid, edge_txnid, extra) for every edge that originates at the
// node of the given txnid. The walk ends once all of that node's edges have
// been visited, or as soon as fn returns non-zero. Nothing happens when the
// graph has no node for txnid.
void wfg::apply_edges(TXNID txnid,
                      int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra),
                      void *extra) {
  node *const origin = find_node(txnid);
  if (origin == nullptr) {
    return;
  }
  const uint32_t edge_count = origin->edges.size();
  for (uint32_t pos = 0; pos < edge_count; pos++) {
    if (fn(txnid, origin->edges.get(pos), extra) != 0) {
      break;
    }
  }
}
// Look up the node for a transaction id.
// Returns the node, or nullptr when the graph has no node for that id.
// The lookup status must be either success or DB_NOTFOUND.
wfg::node *wfg::find_node(TXNID txnid) {
  node *match = nullptr;
  const int r = m_nodes.find_zero<TXNID, find_by_txnid>(txnid, &match, nullptr);
  invariant(r == 0 || r == DB_NOTFOUND);
  return match;
}
// omt comparison callback: orders a stored node against a search txnid by
// the node's own txnid. Yields -1, 0 or 1 for smaller, equal or larger.
int wfg::find_by_txnid(node *const &node_a, const TXNID &txnid_b) {
  const TXNID &stored_txnid = node_a->txnid;
  if (stored_txnid < txnid_b) {
    return -1;
  }
  return stored_txnid == txnid_b ? 0 : 1;
}
// effect: returns the node for the given txnid. when the graph has no such
//         node yet, a new one is allocated and inserted at the index that
//         the lookup reported.
// returns: a non-null node that is stored in m_nodes
wfg::node *wfg::find_create_node(TXNID txnid) {
  // Start both lookup outputs from a defined value so that nothing
  // indeterminate is ever read, whatever the lookup returns.
  node *n = nullptr;
  uint32_t idx = 0;
  int r = m_nodes.find_zero<TXNID, find_by_txnid>(txnid, &n, &idx);
  // Same contract that find_node() enforces: the lookup either succeeds
  // or reports DB_NOTFOUND. Anything else is a bug, caught right here.
  invariant(r == 0 || r == DB_NOTFOUND);
  if (r == DB_NOTFOUND) {
    n = node::alloc(txnid);
    r = m_nodes.insert_at(n, idx);
    invariant_zero(r);
  }
  invariant_notnull(n);
  return n;
}
// Allocate a graph node for txnid via XCALLOC, with the visited flag cleared
// and an edge set that has been created but not filled.
wfg::node *wfg::node::alloc(TXNID txnid) {
  node *XCALLOC(fresh);
  fresh->edges.create();
  fresh->visited = false;
  fresh->txnid = txnid;
  return fresh;
}
// Release a node obtained from alloc(): first the internals of its edge
// set, then the node's own memory.
void wfg::node::free(wfg::node *n) {
  txnid_set &outgoing = n->edges;
  outgoing.destroy();
  toku_free(n);
}
} /* namespace toku */
#endif // OS_WIN
#endif // ROCKSDB_LITE

@ -0,0 +1,123 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include <functional>
#include "../util/omt.h"
#include "txnid_set.h"
namespace toku {
// A wfg is a 'wait-for' graph. A directed edge in the graph represents one
// txn waiting for another to finish before it can acquire a lock.
// There is no constructor or destructor: users call create() before use and
// destroy() when done.
class wfg {
public:
// Create a lock request graph
void create(void);
// Destroy the internals of the lock request graph
void destroy(void);
// Add an edge (a_id, b_id) to the graph
void add_edge(TXNID a_txnid, TXNID b_txnid);
// Return true if a node with the given transaction id exists in the graph.
// Return false otherwise.
bool node_exists(TXNID txnid);
// Return true if there exists a cycle from a given transaction id in the
// graph. Return false otherwise.
// reporter may be empty. When it is set, it is called with the txnids on
// the cycle that was found, one call per edge, as the search unwinds.
bool cycle_exists_from_txnid(TXNID txnid,
std::function<void(TXNID)> reporter);
// Apply a given function f to all of the nodes in the graph. The apply
// function returns when the function f is called for all of the nodes in the
// graph, or the function f returns non-zero.
void apply_nodes(int (*fn)(TXNID txnid, void *extra), void *extra);
// Apply a given function f to all of the edges whose origin is a given node
// id. The apply function returns when the function f is called for all edges
// in the graph rooted at node id, or the function f returns non-zero.
void apply_edges(TXNID txnid,
int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra),
void *extra);
private:
struct node {
// txnid for this node and the associated set of edges
TXNID txnid;
txnid_set edges;
// set while the node is on the path of an ongoing cycle search
bool visited;
// allocate a node for txnid / release a node obtained from alloc()
static node *alloc(TXNID txnid);
static void free(node *n);
};
ENSURE_POD(node);
// the nodes of the graph, looked up by txnid via find_by_txnid
toku::omt<node *> m_nodes;
// returns the node for txnid, or null if there is none
node *find_node(TXNID txnid);
// returns the node for txnid, creating it first if necessary
node *find_create_node(TXNID txnid);
// recursive worker behind cycle_exists_from_txnid(): searches for a path
// from head back to target
bool cycle_exists_from_node(node *target, node *head,
std::function<void(TXNID)> reporter);
// omt comparison function: compares a node's txnid with txnid_b
static int find_by_txnid(node *const &node_a, const TXNID &txnid_b);
};
ENSURE_POD(wfg);
} /* namespace toku */

@ -0,0 +1,201 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include <stdlib.h>
#include "toku_portability.h"
/* Percona memory allocation functions and macros.
* These are functions for malloc and free */
int toku_memory_startup(void) __attribute__((constructor));
void toku_memory_shutdown(void) __attribute__((destructor));
/* Generally: errno is set to 0 or a value to indicate problems. */
// Everything should call toku_malloc() instead of malloc(), and toku_calloc()
// instead of calloc(). That way the tests can, e.g., replace the malloc
// function using toku_set_func_malloc().
void *toku_calloc(size_t nmemb, size_t size)
__attribute__((__visibility__("default")));
void *toku_xcalloc(size_t nmemb, size_t size)
__attribute__((__visibility__("default")));
void *toku_malloc(size_t size) __attribute__((__visibility__("default")));
void *toku_malloc_aligned(size_t alignment, size_t size)
__attribute__((__visibility__("default")));
// xmalloc aborts instead of return NULL if we run out of memory
void *toku_xmalloc(size_t size) __attribute__((__visibility__("default")));
void *toku_xrealloc(void *, size_t size)
__attribute__((__visibility__("default")));
void *toku_xmalloc_aligned(size_t alignment, size_t size)
__attribute__((__visibility__("default")));
// The following describes toku_xmalloc_aligned(), declared just above.
// Effect: Perform an os_malloc_aligned(size) with the additional property
// that the returned pointer is a multiple of ALIGNMENT.
// Fail with a resource_assert if the allocation fails (don't return an error
// code). If the alloc_aligned function has been set then call it instead.
// Requires: alignment is a power of two.
void toku_free(void *) __attribute__((__visibility__("default")));
size_t toku_malloc_usable_size(void *p)
__attribute__((__visibility__("default")));
/* MALLOC is a macro that helps avoid a common error:
 * Suppose I write
 * struct foo *x = malloc(sizeof(struct foo));
 * That works fine. But if I change it to this, I've probably made a mistake:
 * struct foo *x = malloc(sizeof(struct bar));
 * It can get worse, since one might have something like
 * struct foo *x = malloc(sizeof(struct foo *))
 * which looks reasonable, but it allocates enough to hold a pointer instead of
 * the amount needed for the struct. So instead, write struct foo *MALLOC(x);
 * and you cannot go wrong.
 *
 * All macros in this family size the allocation from sizeof(*v), so v must
 * name a pointer variable. CAST_FROM_VOIDP is defined outside this header.
 */
#define MALLOC(v) CAST_FROM_VOIDP(v, toku_malloc(sizeof(*v)))
/* MALLOC_N makes an array, like calloc, except that the data is not zeroed.
 * Write
 * int *MALLOC_N(5,x);
 * to make an array of 5 integers.
 */
#define MALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_malloc((n) * sizeof(*v)))
// MALLOC_N_ALIGNED is MALLOC_N routed through toku_malloc_aligned()
#define MALLOC_N_ALIGNED(align, n, v) \
CAST_FROM_VOIDP(v, toku_malloc_aligned((align), (n) * sizeof(*v)))
// CALLOC_N is like calloc with auto-figuring out size of members
#define CALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_calloc((n), sizeof(*v)))
// CALLOC is CALLOC_N for a single element
#define CALLOC(v) CALLOC_N(1, v)
// XMALLOC macros are like MALLOC except they abort if the operation fails
#define XMALLOC(v) CAST_FROM_VOIDP(v, toku_xmalloc(sizeof(*v)))
#define XMALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xmalloc((n) * sizeof(*v)))
#define XCALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xcalloc((n), (sizeof(*v))))
#define XCALLOC(v) XCALLOC_N(1, v)
// XREALLOC takes the new size s in bytes; XREALLOC_N takes an element count
#define XREALLOC(v, s) CAST_FROM_VOIDP(v, toku_xrealloc(v, s))
#define XREALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xrealloc(v, (n) * sizeof(*v)))
#define XMALLOC_N_ALIGNED(align, n, v) \
CAST_FROM_VOIDP(v, toku_xmalloc_aligned((align), (n) * sizeof(*v)))
#define XMEMDUP(dst, src) CAST_FROM_VOIDP(dst, toku_xmemdup(src, sizeof(*src)))
#define XMEMDUP_N(dst, src, len) CAST_FROM_VOIDP(dst, toku_xmemdup(src, len))
// ZERO_ARRAY writes zeroes to a stack-allocated array
#define ZERO_ARRAY(o) \
do { \
memset((o), 0, sizeof(o)); \
} while (0)
// ZERO_STRUCT writes zeroes to a stack-allocated struct
#define ZERO_STRUCT(o) \
do { \
memset(&(o), 0, sizeof(o)); \
} while (0)
/* Copy memory. Analogous to strdup() */
void *toku_memdup(const void *v, size_t len);
/* Toku-version of strdup. Use this so that it calls toku_malloc() */
char *toku_strdup(const char *s) __attribute__((__visibility__("default")));
/* Toku-version of strndup. Use this so that it calls toku_malloc() */
char *toku_strndup(const char *s, size_t n)
__attribute__((__visibility__("default")));
/* Copy memory. Analogous to strdup() Crashes instead of returning NULL */
void *toku_xmemdup(const void *v, size_t len)
__attribute__((__visibility__("default")));
/* Toku-version of strdup. Use this so that it calls toku_xmalloc() Crashes
* instead of returning NULL */
char *toku_xstrdup(const char *s) __attribute__((__visibility__("default")));
void toku_malloc_cleanup(
void); /* Before exiting, call this function to free up any internal data
structures from toku_malloc. Otherwise valgrind will complain of
memory leaks. */
/* Check to see if everything malloc'd was free. Might be a no-op depending on
* how memory.c is configured. */
void toku_memory_check_all_free(void);
/* Check to see if memory is "sane". Might be a no-op. Probably better to
* simply use valgrind. */
void toku_do_memory_check(void);
typedef void *(*malloc_fun_t)(size_t);
typedef void (*free_fun_t)(void *);
typedef void *(*realloc_fun_t)(void *, size_t);
typedef void *(*malloc_aligned_fun_t)(size_t /*alignment*/, size_t /*size*/);
typedef void *(*realloc_aligned_fun_t)(size_t /*alignment*/, void * /*pointer*/,
size_t /*size*/);
void toku_set_func_malloc(malloc_fun_t f);
void toku_set_func_xmalloc_only(malloc_fun_t f);
void toku_set_func_malloc_only(malloc_fun_t f);
void toku_set_func_realloc(realloc_fun_t f);
void toku_set_func_xrealloc_only(realloc_fun_t f);
void toku_set_func_realloc_only(realloc_fun_t f);
void toku_set_func_free(free_fun_t f);
// Counters describing allocator activity. A pointer to this struct
// (LOCAL_MEMORY_STATUS) is what toku_memory_get_status() takes.
typedef struct memory_status {
uint64_t malloc_count; // number of malloc operations
uint64_t free_count; // number of free operations
uint64_t realloc_count; // number of realloc operations
uint64_t malloc_fail; // number of malloc operations that failed
uint64_t realloc_fail; // number of realloc operations that failed
uint64_t requested; // number of bytes requested
uint64_t used; // number of bytes used (requested + overhead), obtained from
// malloc_usable_size()
uint64_t freed; // number of bytes freed;
uint64_t max_requested_size; // largest attempted allocation size
uint64_t last_failed_size; // size of the last failed allocation attempt
volatile uint64_t
max_in_use; // maximum memory footprint (used - freed), approximate (not
// worth threadsafety overhead for exact)
// NOTE(review): presumably a version string of the underlying allocator;
// who owns the string is not visible here - confirm.
const char *mallocator_version;
// NOTE(review): presumably the allocator's mmap threshold in bytes - confirm.
uint64_t mmap_threshold;
} LOCAL_MEMORY_STATUS_S, *LOCAL_MEMORY_STATUS;
void toku_memory_get_status(LOCAL_MEMORY_STATUS s);
// Effect: Like toku_memory_footprint, except instead of passing p,
// we pass toku_malloc_usable_size(p).
size_t toku_memory_footprint_given_usable_size(size_t touched, size_t usable);
// Effect: Return an estimate of how much space an object is using, possibly by
// using toku_malloc_usable_size(p).
// If p is NULL then returns 0.
size_t toku_memory_footprint(void *p, size_t touched);

@ -0,0 +1,37 @@
//
// A replacement for toku_assert.h
//
#pragma once
#include <assert.h>
#include <errno.h>
// Assertion shims standing in for toku_assert.h.
//
// With NDEBUG defined the four macros below still evaluate their argument
// (cast to void), so any side effects of the expression are kept; without
// NDEBUG they forward to assert().
#ifdef NDEBUG
#define assert_zero(a) ((void)(a))
#define invariant(a) ((void)(a))
#define invariant_notnull(a) ((void)(a))
#define invariant_zero(a) ((void)(a))
#else
#define assert_zero(a) assert((a) == 0)
#define invariant(a) assert(a)
#define invariant_notnull(a) assert(a)
#define invariant_zero(a) assert_zero(a)
#endif
// The lazy/paranoid spellings are plain aliases of assert_zero()/assert().
#define lazy_assert_zero(a) assert_zero(a)
#define paranoid_invariant_zero(a) assert_zero(a)
#define paranoid_invariant_notnull(a) assert(a)
#define paranoid_invariant(a) assert(a)
// Compile-time check that `type` is POD. Expands to a static_assert on
// std::is_pod, so <type_traits> must be visible where the macro is used.
// The leading space in the message keeps the stringified type name from
// running into the text.
#define ENSURE_POD(type) \
  static_assert(std::is_pod<type>::value, #type " isn't POD")
// Returns the current value of errno. The invariant documents that callers
// only use this after a failure, i.e. when errno is expected to be non-zero.
inline int get_error_errno(void) {
  invariant(errno);
  return errno;
}

@ -0,0 +1,116 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
// PORT2: #include <portability/toku_config.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "toku_assert_subst.h"
// Maps an address (as an integer) to the index of the cache line containing
// it, assuming 64-byte cache lines.
__attribute__((const, always_inline)) static inline intptr_t which_cache_line(
    intptr_t addr) {
  static const size_t line_bytes = 64;
  // The division is carried out in size_t, exactly as the mixed
  // intptr_t / size_t expression would be.
  const size_t line_index = static_cast<size_t>(addr) / line_bytes;
  return static_cast<intptr_t>(line_index);
}
// True when the `width` bytes starting at `addr` do not all fall in the same
// cache line, as computed by which_cache_line().
template <typename T>
__attribute__((const, always_inline)) static inline bool crosses_boundary(
    T *addr, size_t width) {
  const intptr_t first_byte = reinterpret_cast<intptr_t>(addr);
  const intptr_t final_byte = first_byte + width - 1;
  const bool same_line =
      which_cache_line(first_byte) == which_cache_line(final_byte);
  return !same_line;
}
// Wrapper around __sync_fetch_and_add(addr, diff); returns what the builtin
// returns. When assertions are enabled it first checks that *addr does not
// straddle a cache-line boundary.
template <typename T, typename U>
__attribute__((always_inline)) static inline T toku_sync_fetch_and_add(T *addr,
                                                                       U diff) {
  paranoid_invariant(!crosses_boundary(addr, sizeof(T)));
  const auto prior = __sync_fetch_and_add(addr, diff);
  return prior;
}
// Wrapper around __sync_add_and_fetch(addr, diff); returns what the builtin
// returns. When assertions are enabled it first checks that *addr does not
// straddle a cache-line boundary.
template <typename T, typename U>
__attribute__((always_inline)) static inline T toku_sync_add_and_fetch(T *addr,
                                                                       U diff) {
  paranoid_invariant(!crosses_boundary(addr, sizeof(T)));
  const auto updated = __sync_add_and_fetch(addr, diff);
  return updated;
}
// Wrapper around __sync_fetch_and_sub(addr, diff); returns what the builtin
// returns. When assertions are enabled it first checks that *addr does not
// straddle a cache-line boundary.
template <typename T, typename U>
__attribute__((always_inline)) static inline T toku_sync_fetch_and_sub(T *addr,
                                                                       U diff) {
  paranoid_invariant(!crosses_boundary(addr, sizeof(T)));
  const auto prior = __sync_fetch_and_sub(addr, diff);
  return prior;
}
// Wrapper around __sync_sub_and_fetch(addr, diff); returns what the builtin
// returns. When assertions are enabled it first checks that *addr does not
// straddle a cache-line boundary.
template <typename T, typename U>
__attribute__((always_inline)) static inline T toku_sync_sub_and_fetch(T *addr,
                                                                       U diff) {
  paranoid_invariant(!crosses_boundary(addr, sizeof(T)));
  const auto updated = __sync_sub_and_fetch(addr, diff);
  return updated;
}
// Wrapper around __sync_val_compare_and_swap(addr, oldval, newval); returns
// what the builtin returns. When assertions are enabled it first checks that
// *addr does not straddle a cache-line boundary.
template <typename T, typename U, typename V>
__attribute__((always_inline)) static inline T toku_sync_val_compare_and_swap(
    T *addr, U oldval, V newval) {
  paranoid_invariant(!crosses_boundary(addr, sizeof(T)));
  const auto observed = __sync_val_compare_and_swap(addr, oldval, newval);
  return observed;
}
// Wrapper around __sync_bool_compare_and_swap(addr, oldval, newval); returns
// the builtin's boolean result. When assertions are enabled it first checks
// that *addr does not straddle a cache-line boundary.
template <typename T, typename U, typename V>
__attribute__((always_inline)) static inline bool
toku_sync_bool_compare_and_swap(T *addr, U oldval, V newval) {
  paranoid_invariant(!crosses_boundary(addr, sizeof(T)));
  const bool swapped = __sync_bool_compare_and_swap(addr, oldval, newval);
  return swapped;
}
// in case you include this but not toku_portability.h
// Poisoning these identifiers makes any later direct use of the raw builtins
// a compile error, so code after this point has to go through the
// toku_sync_* wrappers defined above.
#pragma GCC poison __sync_fetch_and_add
#pragma GCC poison __sync_fetch_and_sub
#pragma GCC poison __sync_fetch_and_or
#pragma GCC poison __sync_fetch_and_and
#pragma GCC poison __sync_fetch_and_xor
#pragma GCC poison __sync_fetch_and_nand
#pragma GCC poison __sync_add_and_fetch
#pragma GCC poison __sync_sub_and_fetch
#pragma GCC poison __sync_or_and_fetch
#pragma GCC poison __sync_and_and_fetch
#pragma GCC poison __sync_xor_and_fetch
#pragma GCC poison __sync_nand_and_fetch
#pragma GCC poison __sync_bool_compare_and_swap
#pragma GCC poison __sync_val_compare_and_swap
#pragma GCC poison __sync_synchronize
#pragma GCC poison __sync_lock_test_and_set
// NOTE(review): GCC's lock-release builtin is spelled __sync_lock_release;
// __sync_release does not appear to be a builtin name - confirm which
// identifier was meant to be poisoned here.
#pragma GCC poison __sync_release

@ -0,0 +1,82 @@
/*
A wrapper around rocksdb::TransactionDBMutexFactory-provided condition and
mutex that provides toku_pthread_*-like interface. The functions are named
toku_external_{mutex|cond}_XXX
Lock Tree uses this mutex and condition for interruptible (long) lock waits.
(It also still uses toku_pthread_XXX calls for mutexes/conditions for
shorter waits on internal objects)
*/
#pragma once
#include <pthread.h>
#include <stdint.h>
#include <time.h>
#include "rocksdb/utilities/transaction_db.h"
#include "rocksdb/utilities/transaction_db_mutex.h"
#include "toku_portability.h"
using ROCKSDB_NAMESPACE::TransactionDBCondVar;
using ROCKSDB_NAMESPACE::TransactionDBMutex;
typedef std::shared_ptr<ROCKSDB_NAMESPACE::TransactionDBMutexFactory>
toku_external_mutex_factory_t;
typedef std::shared_ptr<TransactionDBMutex> toku_external_mutex_t;
typedef std::shared_ptr<TransactionDBCondVar> toku_external_cond_t;
// Asks `mutex_factory` for a new condition variable and stores it in *cond.
static inline void toku_external_cond_init(
    toku_external_mutex_factory_t mutex_factory, toku_external_cond_t *cond) {
  toku_external_cond_t allocated = mutex_factory->AllocateCondVar();
  // After the swap `allocated` holds whatever *cond referred to before and
  // drops that reference when it goes out of scope.
  cond->swap(allocated);
}
// Drops this handle's reference to the condition variable, leaving *cond
// empty. The managed object is destroyed if this was the last reference.
inline void toku_external_cond_destroy(toku_external_cond_t *cond) {
  toku_external_cond_t().swap(*cond);
}
// Forwards to Notify() on the condition variable held by *cond.
inline void toku_external_cond_signal(toku_external_cond_t *cond) {
  TransactionDBCondVar &cv = **cond;
  cv.Notify();
}
// Forwards to NotifyAll() on the condition variable held by *cond.
inline void toku_external_cond_broadcast(toku_external_cond_t *cond) {
  TransactionDBCondVar &cv = **cond;
  cv.NotifyAll();
}
// Waits on *cond with *mutex for up to `timeout_microsec` via WaitFor().
// Returns 0 when WaitFor() reports OK; every other status is reported to the
// caller as ETIMEDOUT.
inline int toku_external_cond_timedwait(toku_external_cond_t *cond,
                                        toku_external_mutex_t *mutex,
                                        int64_t timeout_microsec) {
  const bool wait_ok = (*cond)->WaitFor(*mutex, timeout_microsec).ok();
  return wait_ok ? 0 : ETIMEDOUT;
}
// Constructs the shared_ptr handle in the caller-provided storage at `mutex`
// and points it at a mutex obtained from `factory`.
inline void toku_external_mutex_init(toku_external_mutex_factory_t factory,
                                     toku_external_mutex_t *mutex) {
  // The storage exists but no constructor has run on it yet, hence the
  // placement new before the handle is assigned.
  toku_external_mutex_t *slot = new (mutex) toku_external_mutex_t;
  *slot = factory->AllocateMutex();
}
// Forwards to Lock() on the mutex held by *mutex; the value Lock() returns is
// not inspected.
inline void toku_external_mutex_lock(toku_external_mutex_t *mutex) {
  TransactionDBMutex &target = **mutex;
  target.Lock();
}
// Attempts to take the mutex held by *mutex without committing to a wait.
// Returns 0 when the lock was obtained and EBUSY otherwise, matching the
// pthread_mutex_trylock() convention. The unconditional Lock() call this
// replaces could block and always reported success, so callers had no way
// to see contention.
inline int toku_external_mutex_trylock(toku_external_mutex_t *mutex) {
  // A zero timeout requests an immediate attempt. Any non-OK status (timed
  // out or otherwise) means the lock is not held by us on return.
  const auto status = (*mutex)->TryLockFor(0);
  return status.ok() ? 0 : EBUSY;
}
// Forwards to UnLock() on the mutex held by *mutex.
inline void toku_external_mutex_unlock(toku_external_mutex_t *mutex) {
  mutex->get()->UnLock();
}
// Drops this handle's reference to the mutex, leaving *mutex empty. The
// managed object is destroyed if this was the last reference.
inline void toku_external_mutex_destroy(toku_external_mutex_t *mutex) {
  toku_external_mutex_t().swap(*mutex);
}

@ -0,0 +1,240 @@
#pragma once
#include <stdio.h> // FILE
// Performance instrumentation object identifier type
typedef unsigned int pfs_key_t;
enum class toku_instr_object_type { mutex, rwlock, cond, thread, file };
struct PSI_file;
// Pairs a stdio stream with its file-instrumentation handle. Both members
// start out null.
struct TOKU_FILE {
  /** The real file. */
  FILE *file = nullptr;
  /** Instrumentation handle associated with the file. */
  struct PSI_file *key = nullptr;
  TOKU_FILE() {}
};
struct PSI_mutex;
struct PSI_cond;
struct PSI_rwlock;
struct toku_mutex_t;
struct toku_cond_t;
struct toku_pthread_rwlock_t;
class toku_instr_key;
// Probe type whose constructor, start and stop are all empty: it accepts an
// instrumentation key and source location and does nothing with them.
class toku_instr_probe_empty {
public:
explicit toku_instr_probe_empty(UU(const toku_instr_key &key)) {}
void start_with_source_location(UU(const char *src_file), UU(int src_line)) {}
void stop() {}
};
// Starts probe `p` (a pointer), tagging it with the call site's file and line.
#define TOKU_PROBE_START(p) p->start_with_source_location(__FILE__, __LINE__)
// NOTE(review): unlike TOKU_PROBE_START this expands to `p->stop` without
// call parentheses, so a use site would have to append `()` itself - confirm
// whether `p->stop()` was intended before changing it.
#define TOKU_PROBE_STOP(p) p->stop
extern toku_instr_key toku_uninstrumented;
#ifndef MYSQL_TOKUDB_ENGINE
#include <pthread.h>
// Instrumentation key used when MYSQL_TOKUDB_ENGINE is not defined. It keeps
// no state: every constructor ignores its arguments.
class toku_instr_key {
public:
// Accepts an object type, group and name; all three are ignored.
toku_instr_key(UU(toku_instr_object_type type), UU(const char *group),
UU(const char *name)) {}
// Accepts a numeric key id; ignored.
explicit toku_instr_key(UU(pfs_key_t key_id)) {}
// No-instrumentation constructor:
toku_instr_key() {}
~toku_instr_key() {}
};
typedef toku_instr_probe_empty toku_instr_probe;
// Kinds of file operation passed to the toku_instr_file_* hooks in this
// header.
enum class toku_instr_file_op {
file_stream_open,
file_create,
file_open,
file_delete,
file_rename,
file_read,
file_write,
file_sync,
file_stream_close,
file_close,
file_stat
};
struct PSI_file {};
struct PSI_mutex {};
struct toku_io_instrumentation {};
// Creates a thread via pthread_create() and returns its result code. The
// instrumentation key is accepted for interface compatibility and ignored.
inline int toku_pthread_create(UU(const toku_instr_key &key), pthread_t *thread,
                               const pthread_attr_t *attr,
                               void *(*start_routine)(void *), void *arg) {
  const int rc = pthread_create(thread, attr, start_routine, arg);
  return rc;
}
// Thread and file instrumentation hooks for the build without
// MYSQL_TOKUDB_ENGINE. Every function below has an empty body and ignores
// all of its arguments.
inline void toku_instr_register_current_thread() {}
inline void toku_instr_delete_current_thread() {}
// Instrument file creation, opening, closing, and renaming
inline void toku_instr_file_open_begin(UU(toku_io_instrumentation &io_instr),
UU(const toku_instr_key &key),
UU(toku_instr_file_op op),
UU(const char *name),
UU(const char *src_file),
UU(int src_line)) {}
inline void toku_instr_file_stream_open_end(
UU(toku_io_instrumentation &io_instr), UU(TOKU_FILE &file)) {}
inline void toku_instr_file_open_end(UU(toku_io_instrumentation &io_instr),
UU(int fd)) {}
inline void toku_instr_file_name_close_begin(
UU(toku_io_instrumentation &io_instr), UU(const toku_instr_key &key),
UU(toku_instr_file_op op), UU(const char *name), UU(const char *src_file),
UU(int src_line)) {}
inline void toku_instr_file_stream_close_begin(
UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op),
UU(TOKU_FILE &file), UU(const char *src_file), UU(int src_line)) {}
inline void toku_instr_file_fd_close_begin(
UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op),
UU(int fd), UU(const char *src_file), UU(int src_line)) {}
inline void toku_instr_file_close_end(UU(toku_io_instrumentation &io_instr),
UU(int result)) {}
// Instrument reads/writes on a descriptor, a named file, or a stream
inline void toku_instr_file_io_begin(UU(toku_io_instrumentation &io_instr),
UU(toku_instr_file_op op), UU(int fd),
UU(unsigned int count),
UU(const char *src_file),
UU(int src_line)) {}
inline void toku_instr_file_name_io_begin(
UU(toku_io_instrumentation &io_instr), UU(const toku_instr_key &key),
UU(toku_instr_file_op op), UU(const char *name), UU(unsigned int count),
UU(const char *src_file), UU(int src_line)) {}
inline void toku_instr_file_stream_io_begin(
UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op),
UU(TOKU_FILE &file), UU(unsigned int count), UU(const char *src_file),
UU(int src_line)) {}
inline void toku_instr_file_io_end(UU(toku_io_instrumentation &io_instr),
UU(unsigned int count)) {}
// Mutex instrumentation hooks for the build without MYSQL_TOKUDB_ENGINE.
// All of them ignore their arguments.
struct toku_mutex_t;
// Empty per-operation state object passed between the lock_start/lock_end
// hooks.
struct toku_mutex_instrumentation {};
// Always returns nullptr and does not modify `mutex`, so a caller that wants
// mutex.psi_mutex set has to store the result itself.
inline PSI_mutex *toku_instr_mutex_init(UU(const toku_instr_key &key),
UU(toku_mutex_t &mutex)) {
return nullptr;
}
inline void toku_instr_mutex_destroy(UU(PSI_mutex *&mutex_instr)) {}
inline void toku_instr_mutex_lock_start(
UU(toku_mutex_instrumentation &mutex_instr), UU(toku_mutex_t &mutex),
UU(const char *src_file), UU(int src_line)) {}
inline void toku_instr_mutex_trylock_start(
UU(toku_mutex_instrumentation &mutex_instr), UU(toku_mutex_t &mutex),
UU(const char *src_file), UU(int src_line)) {}
inline void toku_instr_mutex_lock_end(
UU(toku_mutex_instrumentation &mutex_instr),
UU(int pthread_mutex_lock_result)) {}
inline void toku_instr_mutex_unlock(UU(PSI_mutex *mutex_instr)) {}
// Condition-variable instrumentation hooks for the build without
// MYSQL_TOKUDB_ENGINE. All of them ignore their arguments.
// Empty per-operation state object passed between wait_start and wait_end.
struct toku_cond_instrumentation {};
// Which kind of wait is being instrumented.
enum class toku_instr_cond_op {
cond_wait,
cond_timedwait,
};
// Always returns nullptr and does not modify `cond`, so a caller that wants
// cond.psi_cond set has to store the result itself.
inline PSI_cond *toku_instr_cond_init(UU(const toku_instr_key &key),
UU(toku_cond_t &cond)) {
return nullptr;
}
inline void toku_instr_cond_destroy(UU(PSI_cond *&cond_instr)) {}
inline void toku_instr_cond_wait_start(
UU(toku_cond_instrumentation &cond_instr), UU(toku_instr_cond_op op),
UU(toku_cond_t &cond), UU(toku_mutex_t &mutex), UU(const char *src_file),
UU(int src_line)) {}
inline void toku_instr_cond_wait_end(UU(toku_cond_instrumentation &cond_instr),
UU(int pthread_cond_wait_result)) {}
inline void toku_instr_cond_signal(UU(toku_cond_t &cond)) {}
inline void toku_instr_cond_broadcast(UU(toku_cond_t &cond)) {}
#if 0
// rw locks are not used
// rwlock instrumentation
struct toku_rwlock_instrumentation {};
inline PSI_rwlock *toku_instr_rwlock_init(UU(const toku_instr_key &key),
UU(toku_pthread_rwlock_t &rwlock)) {
return nullptr;
}
inline void toku_instr_rwlock_destroy(UU(PSI_rwlock *&rwlock_instr)) {}
inline void toku_instr_rwlock_rdlock_wait_start(
UU(toku_rwlock_instrumentation &rwlock_instr),
UU(toku_pthread_rwlock_t &rwlock),
UU(const char *src_file),
UU(int src_line)) {}
inline void toku_instr_rwlock_wrlock_wait_start(
UU(toku_rwlock_instrumentation &rwlock_instr),
UU(toku_pthread_rwlock_t &rwlock),
UU(const char *src_file),
UU(int src_line)) {}
inline void toku_instr_rwlock_rdlock_wait_end(
UU(toku_rwlock_instrumentation &rwlock_instr),
UU(int pthread_rwlock_wait_result)) {}
inline void toku_instr_rwlock_wrlock_wait_end(
UU(toku_rwlock_instrumentation &rwlock_instr),
UU(int pthread_rwlock_wait_result)) {}
inline void toku_instr_rwlock_unlock(UU(toku_pthread_rwlock_t &rwlock)) {}
#endif
#else // MYSQL_TOKUDB_ENGINE
// There can be not only mysql but also mongodb or any other PFS stuff
#include <toku_instr_mysql.h>
#endif // MYSQL_TOKUDB_ENGINE
// Mutexes
extern toku_instr_key manager_escalation_mutex_key;
extern toku_instr_key manager_escalator_mutex_key;
extern toku_instr_key manager_mutex_key;
extern toku_instr_key treenode_mutex_key;
extern toku_instr_key locktree_request_info_mutex_key;
extern toku_instr_key locktree_request_info_retry_mutex_key;
// condition vars
extern toku_instr_key lock_request_m_wait_cond_key;
extern toku_instr_key locktree_request_info_retry_cv_key;
extern toku_instr_key manager_m_escalator_done_key; // unused

@ -0,0 +1,73 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#if defined(__clang__)
#define constexpr_static_assert(a, b)
#else
#define constexpr_static_assert(a, b) static_assert(a, b)
#endif
// include here, before they get deprecated
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include "toku_atomic.h"
#if defined(__cplusplus)
#include <type_traits>
#endif
// CAST_FROM_VOIDP(name, value): assigns `value` to `name`, casting it to the
// declared type of `name`. Used by the MALLOC-style macros to convert a
// void* result to the destination pointer type.
#if defined(__cplusplus)
// decltype() here gives a reference-to-pointer instead of just a pointer,
// just use __typeof__
#define CAST_FROM_VOIDP(name, value) name = static_cast<__typeof__(name)>(value)
#else
// NOTE(review): cast_to_typeof is not defined anywhere in this header;
// confirm it exists elsewhere or that this C branch is never compiled.
#define CAST_FROM_VOIDP(name, value) name = cast_to_typeof(name)(value)
#endif
// UU(x): declares `x` with the unused attribute, silencing unused-parameter
// and unused-variable warnings.
#define UU(x) x __attribute__((__unused__))
#include "toku_instrumentation.h"

@ -0,0 +1,501 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include <pthread.h>
#include <stdint.h>
#include <time.h>
#include "toku_portability.h"
// PORT2: #include "toku_assert.h"
// TODO: some things moved toku_instrumentation.h, not necessarily the best
// place
typedef pthread_attr_t toku_pthread_attr_t;
typedef pthread_t toku_pthread_t;
typedef pthread_mutex_t toku_pthread_mutex_t;
typedef pthread_condattr_t toku_pthread_condattr_t;
typedef pthread_cond_t toku_pthread_cond_t;
typedef pthread_rwlockattr_t toku_pthread_rwlockattr_t;
typedef pthread_key_t toku_pthread_key_t;
typedef struct timespec toku_timespec_t;
// TODO: break this include loop
#include <pthread.h>
typedef pthread_mutexattr_t toku_pthread_mutexattr_t;
// A pthread mutex plus an instrumentation hook. With TOKU_PTHREAD_DEBUG it
// also carries ownership/validity fields that the lock/unlock helpers check.
// The *_MUTEX_INITIALIZER macros in this header name these members in
// declaration order.
struct toku_mutex_t {
pthread_mutex_t pmutex;
struct PSI_mutex *psi_mutex; /* The performance schema instrumentation hook */
#if defined(TOKU_PTHREAD_DEBUG)
pthread_t owner; // = pthread_self(); // for debugging
bool locked; // true while some thread holds the mutex
bool valid; // set by toku_mutex_init, cleared by toku_mutex_destroy
pfs_key_t instr_key_id;
#endif // defined(TOKU_PTHREAD_DEBUG)
};
// A pthread condition variable plus an instrumentation hook.
// TOKU_COND_INITIALIZER names these members in declaration order.
struct toku_cond_t {
pthread_cond_t pcond;
struct PSI_cond *psi_cond; // instrumentation hook; read by toku_cond_destroy
#if defined(TOKU_PTHREAD_DEBUG)
pfs_key_t instr_key_id;
#endif // defined(TOKU_PTHREAD_DEBUG)
};
#if defined(TOKU_PTHREAD_DEBUG)
#define TOKU_COND_INITIALIZER \
{ .pcond = PTHREAD_COND_INITIALIZER, .psi_cond = nullptr, .instr_key_id = 0 }
#else
#define TOKU_COND_INITIALIZER \
{ .pcond = PTHREAD_COND_INITIALIZER, .psi_cond = nullptr }
#endif // defined(TOKU_PTHREAD_DEBUG)
// A pthread read-write lock plus an instrumentation hook.
struct toku_pthread_rwlock_t {
pthread_rwlock_t rwlock;
struct PSI_rwlock *psi_rwlock;
#if defined(TOKU_PTHREAD_DEBUG)
pfs_key_t instr_key_id;
#endif // defined(TOKU_PTHREAD_DEBUG)
};
// A toku_mutex_t whose storage is aligned to 64 bytes.
typedef struct toku_mutex_aligned {
toku_mutex_t aligned_mutex __attribute__((__aligned__(64)));
} toku_mutex_aligned_t;
// Initializing with {} will fill in a struct with all zeros.
// But you may also need a pragma to suppress the warnings, as follows
//
// #pragma GCC diagnostic push
// #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
// toku_mutex_t foo = ZERO_MUTEX_INITIALIZER;
// #pragma GCC diagnostic pop
//
// In general it will be a lot of busy work to make this codebase compile
// cleanly with -Wmissing-field-initializers
#define ZERO_MUTEX_INITIALIZER \
{}
#if defined(TOKU_PTHREAD_DEBUG)
#define TOKU_MUTEX_INITIALIZER \
{ \
.pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr, .owner = 0, \
.locked = false, .valid = true, .instr_key_id = 0 \
}
#else
#define TOKU_MUTEX_INITIALIZER \
{ .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr }
#endif // defined(TOKU_PTHREAD_DEBUG)
// Darwin doesn't provide adaptive mutexes
#if defined(__APPLE__)
#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_DEFAULT
#if defined(TOKU_PTHREAD_DEBUG)
#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
{ \
.pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr, .owner = 0, \
.locked = false, .valid = true, .instr_key_id = 0 \
}
#else
#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
{ .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr }
#endif // defined(TOKU_PTHREAD_DEBUG)
#else // __FreeBSD__, __linux__, at least
#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_ADAPTIVE_NP
#if defined(TOKU_PTHREAD_DEBUG)
#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
{ \
.pmutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, .psi_mutex = nullptr, \
.owner = 0, .locked = false, .valid = true, .instr_key_id = 0 \
}
#else
#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
{ .pmutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, .psi_mutex = nullptr }
#endif // defined(TOKU_PTHREAD_DEBUG)
#endif // defined(__APPLE__)
// Different OSes implement mutexes as different amounts of nested structs.
// C++ will fill out all missing values with zeroes if you provide at least one
// zero, but it needs the right amount of nesting.
#if defined(__FreeBSD__)
#define ZERO_COND_INITIALIZER \
{ 0 }
#elif defined(__APPLE__)
#define ZERO_COND_INITIALIZER \
{ \
{ 0 } \
}
#else // __linux__, at least
#define ZERO_COND_INITIALIZER \
{}
#endif
// Initializes `attr` with pthread_mutexattr_init(); a non-zero result trips
// assert_zero().
static inline void toku_mutexattr_init(toku_pthread_mutexattr_t *attr) {
  const int rc = pthread_mutexattr_init(attr);
  assert_zero(rc);
}
// Sets the mutex type on `attr` with pthread_mutexattr_settype(); a non-zero
// result trips assert_zero().
static inline void toku_mutexattr_settype(toku_pthread_mutexattr_t *attr,
                                          int type) {
  const int rc = pthread_mutexattr_settype(attr, type);
  assert_zero(rc);
}
// Releases `attr` with pthread_mutexattr_destroy(); a non-zero result trips
// assert_zero().
static inline void toku_mutexattr_destroy(toku_pthread_mutexattr_t *attr) {
  const int rc = pthread_mutexattr_destroy(attr);
  assert_zero(rc);
}
// With TOKU_PTHREAD_DEBUG: checks that `mutex` is marked locked and that the
// calling thread is recorded as its owner. Otherwise a no-op.
static inline void toku_mutex_assert_locked(const toku_mutex_t *mutex
                                            __attribute__((unused))) {
#if defined(TOKU_PTHREAD_DEBUG)
  invariant(mutex->locked);
  invariant(mutex->owner == pthread_self());
#endif  // defined(TOKU_PTHREAD_DEBUG)
}
// asserting that a mutex is unlocked only makes sense
// if the calling thread can guarantee that no other threads
// are trying to lock this mutex at the time of the assertion
//
// a good example of this is a tree with mutexes on each node.
// when a node is locked the caller knows that no other threads
// can be trying to lock its childrens' mutexes. the children
// are in one of two fixed states: locked or unlocked.
// With TOKU_PTHREAD_DEBUG: checks that `mutex` has no recorded owner and is
// not marked locked. Otherwise a no-op.
static inline void toku_mutex_assert_unlocked(toku_mutex_t *mutex
                                              __attribute__((unused))) {
#if defined(TOKU_PTHREAD_DEBUG)
  invariant(mutex->owner == 0);
  invariant(!mutex->locked);
#endif  // defined(TOKU_PTHREAD_DEBUG)
}
#define toku_mutex_lock(M) \
toku_mutex_lock_with_source_location(M, __FILE__, __LINE__)
// Initializes `cond` without an instrumentation key.
//   cond: condition variable to initialize
//   attr: attributes forwarded to pthread_cond_init() (may be null)
// A non-zero pthread_cond_init() result trips assert_zero().
static inline void toku_cond_init(toku_cond_t *cond,
                                  const toku_pthread_condattr_t *attr) {
  // toku_cond_destroy() reads cond->psi_cond, so give it a defined value
  // here instead of leaving it indeterminate.
  cond->psi_cond = nullptr;
  const int r = pthread_cond_init(&cond->pcond, attr);
  assert_zero(r);
}
#define toku_mutex_trylock(M) \
toku_mutex_trylock_with_source_location(M, __FILE__, __LINE__)
// Releases `mutex`. A non-zero pthread_mutex_unlock() result trips
// assert_zero().
inline void toku_mutex_unlock(toku_mutex_t *mutex) {
#if defined(TOKU_PTHREAD_DEBUG)
// Only the recorded owner may unlock. The bookkeeping is cleared before the
// real unlock, i.e. while this thread still holds the mutex.
invariant(mutex->owner == pthread_self());
invariant(mutex->valid);
invariant(mutex->locked);
mutex->locked = false;
mutex->owner = 0;
#endif // defined(TOKU_PTHREAD_DEBUG)
toku_instr_mutex_unlock(mutex->psi_mutex);
int r = pthread_mutex_unlock(&mutex->pmutex);
assert_zero(r);
}
// Acquires `mutex` with pthread_mutex_lock(), bracketing the call with the
// lock_start/lock_end instrumentation hooks. `src_file`/`src_line` identify
// the call site and are only handed to the instrumentation. A non-zero lock
// result trips assert_zero().
inline void toku_mutex_lock_with_source_location(toku_mutex_t *mutex,
const char *src_file,
int src_line) {
toku_mutex_instrumentation mutex_instr;
toku_instr_mutex_lock_start(mutex_instr, *mutex, src_file, src_line);
const int r = pthread_mutex_lock(&mutex->pmutex);
toku_instr_mutex_lock_end(mutex_instr, r);
assert_zero(r);
#if defined(TOKU_PTHREAD_DEBUG)
// The bookkeeping is updated only after the lock is held, so these fields
// are touched under the mutex itself.
invariant(mutex->valid);
invariant(!mutex->locked);
invariant(mutex->owner == 0);
mutex->locked = true;
mutex->owner = pthread_self();
#endif // defined(TOKU_PTHREAD_DEBUG)
}
// Tries to acquire `mutex` without blocking.
//   mutex:             mutex to acquire
//   src_file/src_line: call site, handed to the instrumentation hooks only
// Returns 0 when the mutex was acquired, otherwise the error code from
// pthread_mutex_trylock() (EBUSY when the mutex is already locked). The
// result is returned to the caller, not asserted.
inline int toku_mutex_trylock_with_source_location(toku_mutex_t *mutex,
                                                   const char *src_file,
                                                   int src_line) {
  toku_mutex_instrumentation mutex_instr;
  toku_instr_mutex_trylock_start(mutex_instr, *mutex, src_file, src_line);
  // Must be the non-blocking call: pthread_mutex_lock() here would wait for
  // the mutex and make this function indistinguishable from a plain lock.
  const int r = pthread_mutex_trylock(&mutex->pmutex);
  toku_instr_mutex_lock_end(mutex_instr, r);
#if defined(TOKU_PTHREAD_DEBUG)
  // Record ownership only when the attempt actually succeeded.
  if (r == 0) {
    invariant(mutex->valid);
    invariant(!mutex->locked);
    invariant(mutex->owner == 0);
    mutex->locked = true;
    mutex->owner = pthread_self();
  }
#endif  // defined(TOKU_PTHREAD_DEBUG)
  return r;
}
#define toku_cond_wait(C, M) \
toku_cond_wait_with_source_location(C, M, __FILE__, __LINE__)
#define toku_cond_timedwait(C, M, W) \
toku_cond_timedwait_with_source_location(C, M, W, __FILE__, __LINE__)
inline void toku_cond_init(const toku_instr_key &key, toku_cond_t *cond,
const pthread_condattr_t *attr) {
toku_instr_cond_init(key, *cond);
int r = pthread_cond_init(&cond->pcond, attr);
assert_zero(r);
}
// Tears down `cond`: hands its instrumentation hook to
// toku_instr_cond_destroy(), then destroys the pthread condition variable.
// A non-zero pthread_cond_destroy() result trips assert_zero().
inline void toku_cond_destroy(toku_cond_t *cond) {
  toku_instr_cond_destroy(cond->psi_cond);
  const int rc = pthread_cond_destroy(&cond->pcond);
  assert_zero(rc);
}
// Waits on `cond` with `mutex` via pthread_cond_wait(), bracketing the call
// with the cond-wait instrumentation hooks. `src_file`/`src_line` identify
// the call site and are only handed to the instrumentation. A non-zero wait
// result trips assert_zero().
inline void toku_cond_wait_with_source_location(toku_cond_t *cond,
toku_mutex_t *mutex,
const char *src_file,
int src_line) {
#if defined(TOKU_PTHREAD_DEBUG)
// The debug ownership record is cleared before the wait, because the wait
// releases the mutex.
invariant(mutex->locked);
mutex->locked = false;
mutex->owner = 0;
#endif // defined(TOKU_PTHREAD_DEBUG)
/* Instrumentation start */
toku_cond_instrumentation cond_instr;
toku_instr_cond_wait_start(cond_instr, toku_instr_cond_op::cond_wait, *cond,
*mutex, src_file, src_line);
/* Instrumented code */
const int r = pthread_cond_wait(&cond->pcond, &mutex->pmutex);
/* Instrumentation end */
toku_instr_cond_wait_end(cond_instr, r);
assert_zero(r);
#if defined(TOKU_PTHREAD_DEBUG)
// The mutex is held again on return, so ownership is recorded again.
invariant(!mutex->locked);
mutex->locked = true;
mutex->owner = pthread_self();
#endif // defined(TOKU_PTHREAD_DEBUG)
}
// Waits on `cond` with `mutex` via pthread_cond_timedwait(). `wakeup_at` is
// passed straight through as that call's time argument. Returns the
// pthread_cond_timedwait() result unchanged; unlike the untimed wait, the
// result is not asserted to be zero.
inline int toku_cond_timedwait_with_source_location(toku_cond_t *cond,
toku_mutex_t *mutex,
toku_timespec_t *wakeup_at,
const char *src_file,
int src_line) {
#if defined(TOKU_PTHREAD_DEBUG)
// The debug ownership record is cleared before the wait, because the wait
// releases the mutex.
invariant(mutex->locked);
mutex->locked = false;
mutex->owner = 0;
#endif // defined(TOKU_PTHREAD_DEBUG)
/* Instrumentation start */
toku_cond_instrumentation cond_instr;
toku_instr_cond_wait_start(cond_instr, toku_instr_cond_op::cond_timedwait,
*cond, *mutex, src_file, src_line);
/* Instrumented code */
const int r = pthread_cond_timedwait(&cond->pcond, &mutex->pmutex, wakeup_at);
/* Instrumentation end */
toku_instr_cond_wait_end(cond_instr, r);
#if defined(TOKU_PTHREAD_DEBUG)
// Ownership is recorded again whatever the wait returned.
invariant(!mutex->locked);
mutex->locked = true;
mutex->owner = pthread_self();
#endif // defined(TOKU_PTHREAD_DEBUG)
return r;
}
// Notifies the instrumentation layer, then calls pthread_cond_signal() on
// `cond`. A non-zero result trips assert_zero().
inline void toku_cond_signal(toku_cond_t *cond) {
  toku_instr_cond_signal(*cond);
  pthread_cond_t *const native = &cond->pcond;
  const int rc = pthread_cond_signal(native);
  assert_zero(rc);
}
// Notifies the instrumentation layer, then calls pthread_cond_broadcast() on
// `cond`. A non-zero result trips assert_zero().
inline void toku_cond_broadcast(toku_cond_t *cond) {
  toku_instr_cond_broadcast(*cond);
  pthread_cond_t *const native = &cond->pcond;
  const int rc = pthread_cond_broadcast(native);
  assert_zero(rc);
}
// Initializes `mutex`: registers it with the instrumentation layer under
// `key`, then calls pthread_mutex_init() with `attr` and asserts that it
// returned 0. Under TOKU_PTHREAD_DEBUG the debug fields are also reset:
// valid = true, locked = false, owner = 0.
inline void toku_mutex_init(const toku_instr_key &key, toku_mutex_t *mutex,
const toku_pthread_mutexattr_t *attr) {
#if defined(TOKU_PTHREAD_DEBUG)
// Mark the mutex valid before the instrumentation hook sees it.
mutex->valid = true;
#endif // defined(TOKU_PTHREAD_DEBUG)
toku_instr_mutex_init(key, *mutex);
const int r = pthread_mutex_init(&mutex->pmutex, attr);
assert_zero(r);
#if defined(TOKU_PTHREAD_DEBUG)
mutex->locked = false;
// `valid` was already set above, so the check and the re-assignment below
// are redundant but harmless.
invariant(mutex->valid);
mutex->valid = true;
mutex->owner = 0;
#endif // defined(TOKU_PTHREAD_DEBUG)
}
// Destroys `mutex`: hands mutex->psi_mutex to the instrumentation hook
// toku_instr_mutex_destroy(), then calls pthread_mutex_destroy() and asserts
// that it returned 0.
inline void toku_mutex_destroy(toku_mutex_t *mutex) {
#if defined(TOKU_PTHREAD_DEBUG)
// Debug checks: the mutex must be marked valid and must not be marked as
// locked. It is flagged invalid before the actual destroy.
invariant(mutex->valid);
mutex->valid = false;
invariant(!mutex->locked);
#endif // defined(TOKU_PTHREAD_DEBUG)
toku_instr_mutex_destroy(mutex->psi_mutex);
int r = pthread_mutex_destroy(&mutex->pmutex);
assert_zero(r);
}
// Convenience macros that forward to the *_with_source_location rwlock
// functions, supplying the call site's __FILE__ and __LINE__.
// NOTE(review): in this header the *_with_source_location rwlock functions
// are defined only inside the `#if 0` region that follows, so using these
// macros will not compile unless the functions are provided elsewhere.
// TODO: confirm.
#define toku_pthread_rwlock_rdlock(RW) \
toku_pthread_rwlock_rdlock_with_source_location(RW, __FILE__, __LINE__)
#define toku_pthread_rwlock_wrlock(RW) \
toku_pthread_rwlock_wrlock_with_source_location(RW, __FILE__, __LINE__)
#if 0
inline void toku_pthread_rwlock_init(
const toku_instr_key &key,
toku_pthread_rwlock_t *__restrict rwlock,
const toku_pthread_rwlockattr_t *__restrict attr) {
toku_instr_rwlock_init(key, *rwlock);
int r = pthread_rwlock_init(&rwlock->rwlock, attr);
assert_zero(r);
}
inline void toku_pthread_rwlock_destroy(toku_pthread_rwlock_t *rwlock) {
toku_instr_rwlock_destroy(rwlock->psi_rwlock);
int r = pthread_rwlock_destroy(&rwlock->rwlock);
assert_zero(r);
}
inline void toku_pthread_rwlock_rdlock_with_source_location(
toku_pthread_rwlock_t *rwlock,
const char *src_file,
uint src_line) {
/* Instrumentation start */
toku_rwlock_instrumentation rwlock_instr;
toku_instr_rwlock_rdlock_wait_start(
rwlock_instr, *rwlock, src_file, src_line);
/* Instrumented code */
const int r = pthread_rwlock_rdlock(&rwlock->rwlock);
/* Instrumentation end */
toku_instr_rwlock_rdlock_wait_end(rwlock_instr, r);
assert_zero(r);
}
inline void toku_pthread_rwlock_wrlock_with_source_location(
toku_pthread_rwlock_t *rwlock,
const char *src_file,
uint src_line) {
/* Instrumentation start */
toku_rwlock_instrumentation rwlock_instr;
toku_instr_rwlock_wrlock_wait_start(
rwlock_instr, *rwlock, src_file, src_line);
/* Instrumented code */
const int r = pthread_rwlock_wrlock(&rwlock->rwlock);
/* Instrumentation end */
toku_instr_rwlock_wrlock_wait_end(rwlock_instr, r);
assert_zero(r);
}
inline void toku_pthread_rwlock_rdunlock(toku_pthread_rwlock_t *rwlock) {
toku_instr_rwlock_unlock(*rwlock);
const int r = pthread_rwlock_unlock(&rwlock->rwlock);
assert_zero(r);
}
inline void toku_pthread_rwlock_wrunlock(toku_pthread_rwlock_t *rwlock) {
toku_instr_rwlock_unlock(*rwlock);
const int r = pthread_rwlock_unlock(&rwlock->rwlock);
assert_zero(r);
}
#endif
// Thin pass-through wrappers around the pthread API. Each one forwards its
// arguments unchanged and returns the pthread call's result as-is.

// Forwards to pthread_join().
static inline int toku_pthread_join(toku_pthread_t thread, void **value_ptr) {
  const int rc = pthread_join(thread, value_ptr);
  return rc;
}

// Forwards to pthread_detach().
static inline int toku_pthread_detach(toku_pthread_t thread) {
  const int rc = pthread_detach(thread);
  return rc;
}

// Forwards to pthread_key_create(); `destroyf` is passed as the destructor
// argument.
static inline int toku_pthread_key_create(toku_pthread_key_t *key,
                                          void (*destroyf)(void *)) {
  const int rc = pthread_key_create(key, destroyf);
  return rc;
}

// Forwards to pthread_key_delete().
static inline int toku_pthread_key_delete(toku_pthread_key_t key) {
  const int rc = pthread_key_delete(key);
  return rc;
}

// Forwards to pthread_getspecific().
static inline void *toku_pthread_getspecific(toku_pthread_key_t key) {
  void *const value = pthread_getspecific(key);
  return value;
}

// Forwards to pthread_setspecific().
static inline int toku_pthread_setspecific(toku_pthread_key_t key, void *data) {
  const int rc = pthread_setspecific(key, data);
  return rc;
}

// Declared here, defined elsewhere.
int toku_pthread_yield(void) __attribute__((__visibility__("default")));

// Forwards to pthread_self().
static inline toku_pthread_t toku_pthread_self(void) {
  const toku_pthread_t self = pthread_self();
  return self;
}

// Calls toku_instr_delete_current_thread() and then terminates the calling
// thread via pthread_exit(exit_value).
static inline void *toku_pthread_done(void *exit_value) {
  toku_instr_delete_current_thread();
  pthread_exit(exit_value);
  return nullptr;  // Avoid compiler warning
}

@ -0,0 +1,165 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
// PORT2: #include <portability/toku_config.h>
#ifdef HAVE_valgrind
#undef USE_VALGRIND
#define USE_VALGRIND 1
#endif
#if defined(__linux__) && USE_VALGRIND
#include <valgrind/drd.h>
#include <valgrind/helgrind.h>
#define TOKU_ANNOTATE_NEW_MEMORY(p, size) ANNOTATE_NEW_MEMORY(p, size)
#define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) \
VALGRIND_HG_ENABLE_CHECKING(p, size)
#define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) \
VALGRIND_HG_DISABLE_CHECKING(p, size)
#define TOKU_DRD_IGNORE_VAR(v) DRD_IGNORE_VAR(v)
#define TOKU_DRD_STOP_IGNORING_VAR(v) DRD_STOP_IGNORING_VAR(v)
#define TOKU_ANNOTATE_IGNORE_READS_BEGIN() ANNOTATE_IGNORE_READS_BEGIN()
#define TOKU_ANNOTATE_IGNORE_READS_END() ANNOTATE_IGNORE_READS_END()
#define TOKU_ANNOTATE_IGNORE_WRITES_BEGIN() ANNOTATE_IGNORE_WRITES_BEGIN()
#define TOKU_ANNOTATE_IGNORE_WRITES_END() ANNOTATE_IGNORE_WRITES_END()
/*
* How to make helgrind happy about tree rotations and new mutex orderings:
*
* // Tell helgrind that we unlocked it so that the next call doesn't get a
* "destroyed a locked mutex" error.
* // Tell helgrind that we destroyed the mutex.
* VALGRIND_HG_MUTEX_UNLOCK_PRE(&locka);
* VALGRIND_HG_MUTEX_DESTROY_PRE(&locka);
*
* // And recreate it. It would be better to simply be able to say that the
* order on these two can now be reversed, because this code forgets all the
* ordering information for this mutex.
* // Then tell helgrind that we have locked it again.
* VALGRIND_HG_MUTEX_INIT_POST(&locka, 0);
* VALGRIND_HG_MUTEX_LOCK_POST(&locka);
*
* When the ordering of two locks changes, we don't need to tell Helgrind about
* both locks. Just one is good enough.
*/
#define TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(mutex) \
VALGRIND_HG_MUTEX_UNLOCK_PRE(mutex); \
VALGRIND_HG_MUTEX_DESTROY_PRE(mutex); \
VALGRIND_HG_MUTEX_INIT_POST(mutex, 0); \
VALGRIND_HG_MUTEX_LOCK_POST(mutex);
#else // !defined(__linux__) || !USE_VALGRIND
#define NVALGRIND 1
#define TOKU_ANNOTATE_NEW_MEMORY(p, size) ((void)0)
#define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) ((void)0)
#define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) ((void)0)
#define TOKU_DRD_IGNORE_VAR(v)
#define TOKU_DRD_STOP_IGNORING_VAR(v)
#define TOKU_ANNOTATE_IGNORE_READS_BEGIN() ((void)0)
#define TOKU_ANNOTATE_IGNORE_READS_END() ((void)0)
#define TOKU_ANNOTATE_IGNORE_WRITES_BEGIN() ((void)0)
#define TOKU_ANNOTATE_IGNORE_WRITES_END() ((void)0)
#define TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(mutex)
#undef RUNNING_ON_VALGRIND
#define RUNNING_ON_VALGRIND (0U)
#endif
// Valgrind 3.10.1 (and previous versions).
// Problems with VALGRIND_HG_DISABLE_CHECKING and VALGRIND_HG_ENABLE_CHECKING.
// Helgrind's implementation of disable and enable checking causes false races
// to be reported. In addition, the race report does not include ANY
// information about the code that uses the helgrind disable and enable
// functions. Therefore, it is very difficult to figure out the cause of the
// race. DRD does implement the disable and enable functions.
// Problems with ANNOTATE_IGNORE_READS.
// Helgrind does not implement ignore reads.
// Annotate ignore reads is the way to inform DRD to ignore racy reads.
// FT code uses unsafe reads in several places. These unsafe reads have been
// noted as valid since they use the toku_unsafe_fetch function. Unfortunately,
// this causes helgrind to report erroneous data races which makes use of
// helgrind problematic.
// Unsafely fetch and return a `T' from src, telling drd to ignore
// racey access to src for the next sizeof(*src) bytes
// Pointer form: copies *src out between the TOKU_ANNOTATE_IGNORE_READS
// begin/end markers and returns the copy. The Helgrind enable/disable calls
// stay permanently switched off behind `if (0)` (see the comment above).
template <typename T>
T toku_unsafe_fetch(T *src) {
  if (0) {
    TOKU_VALGRIND_HG_DISABLE_CHECKING(src,
                                      sizeof *src);  // disabled, see comment
  }
  TOKU_ANNOTATE_IGNORE_READS_BEGIN();
  T fetched = *src;
  TOKU_ANNOTATE_IGNORE_READS_END();
  if (0) {
    TOKU_VALGRIND_HG_ENABLE_CHECKING(src,
                                     sizeof *src);  // disabled, see comment
  }
  return fetched;
}

// Reference form: delegates to the pointer overload with the address of src.
template <typename T>
T toku_unsafe_fetch(T &src) {
  T *const address = &src;
  return toku_unsafe_fetch(address);
}
// Unsafely set a `T' value into *dest from src, telling drd to ignore
// racey access to dest for the next sizeof(*dest) bytes
// Pointer form: stores src into *dest between the TOKU_ANNOTATE_IGNORE_WRITES
// begin/end markers. The Helgrind enable/disable calls stay permanently
// switched off behind `if (0)` (see the comment above).
template <typename T>
void toku_unsafe_set(T *dest, const T src) {
  if (0) {
    TOKU_VALGRIND_HG_DISABLE_CHECKING(dest,
                                      sizeof *dest);  // disabled, see comment
  }
  TOKU_ANNOTATE_IGNORE_WRITES_BEGIN();
  *dest = src;
  TOKU_ANNOTATE_IGNORE_WRITES_END();
  if (0) {
    TOKU_VALGRIND_HG_ENABLE_CHECKING(dest,
                                     sizeof *dest);  // disabled, see comment
  }
}

// Reference form: delegates to the pointer overload with the address of dest.
template <typename T>
void toku_unsafe_set(T &dest, const T src) {
  T *const address = &dest;
  toku_unsafe_set(address, src);
}

@ -0,0 +1,158 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
// PORT2: #include "toku_config.h"
#include <stdint.h>
#include <sys/time.h>
#include <time.h>
#if defined(__powerpc__)
#include <sys/platform/ppc.h>
#endif
#if 0
static inline float toku_tdiff (struct timeval *a, struct timeval *b) {
return (float)((a->tv_sec - b->tv_sec) + 1e-6 * (a->tv_usec - b->tv_usec));
}
// PORT2: temporary:
#define HAVE_CLOCK_REALTIME
#if !defined(HAVE_CLOCK_REALTIME)
// OS X does not have clock_gettime, we fake clockid_t for the interface, and we'll implement it with clock_get_time.
typedef int clockid_t;
// just something bogus, it doesn't matter, we just want to make sure we're
// only supporting this mode because we're not sure we can support other modes
// without a real clock_gettime()
#define CLOCK_REALTIME 0x01867234
#endif
int toku_clock_gettime(clockid_t clk_id, struct timespec *ts) __attribute__((__visibility__("default")));
#endif
// *************** Performance timers ************************
// What do you really want from a performance timer:
// (1) Can determine actual time of day from the performance time.
// (2) Time goes forward, never backward.
// (3) Same time on different processors (or even different machines).
// (4) Time goes forward at a constant rate (doesn't get faster and slower)
// (5) Portable.
// (6) Getting the time is cheap.
// Unfortunately it seems tough to get Properties 1-5. So we go for Property 6,
// but we abstract it. We offer a type tokutime_t which can hold the time. This
// type can be subtracted to get a time difference. We can get the present time
// cheaply. We can convert this type to seconds (but that can be expensive). The
// implementation is to use RDTSC (hence we lose property 5: not portable).
// Recent machines have constant_tsc in which case we get property (4).
// Recent OSs on recent machines (that have RDTSCP) fix the per-processor clock
// skew, so we get property (3). We get property 2 with RDTSC (as long as
// there's not any skew). We don't even try to get property 1, since we don't
// need it. The decision here is that these times are really accurate only on
// modern machines with modern OSs.
typedef uint64_t tokutime_t; // Time type used in by tokutek timers.
#if 0
// The value of tokutime_t is not specified here.
// It might be microseconds since 1/1/1970 (if gettimeofday() is
// used), or clock cycles since boot (if rdtsc is used). Or something
// else.
// Two tokutime_t values can be subtracted to get a time difference.
// Use tokutime_to_seconds to that convert difference to seconds.
// We want get_tokutime() to be fast, but don't care so much about tokutime_to_seconds();
//
// For accurate time calculations do the subtraction in the right order:
// Right: tokutime_to_seconds(t1-t2);
// Wrong tokutime_to_seconds(t1)-toku_time_to_seconds(t2);
// Doing it the wrong way is likely to result in loss of precision.
// A double can hold numbers up to about 53 bits. RDTSC which uses about 33 bits every second, so that leaves
// 2^20 seconds from booting (about 2 weeks) before the RDTSC value cannot be represented accurately as a double.
//
double tokutime_to_seconds(tokutime_t) __attribute__((__visibility__("default"))); // Convert tokutime to seconds.
#endif
// Get the value of tokutime for right now. We want this to be fast, so we
// expose the implementation as RDTSC.
// Returns the current value of an architecture-specific hardware counter as
// a tokutime_t. The unit depends on the platform branch taken below; only
// differences between two values are meaningful. Compilation fails with
// #error on any architecture not listed.
static inline tokutime_t toku_time_now(void) {
#if defined(__x86_64__) || defined(__i386__)
// x86: RDTSC returns the low 32 bits via the "a" register constraint and the
// high 32 bits via "d"; combine them into one 64-bit value.
uint32_t lo, hi;
__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
return (uint64_t)hi << 32 | lo;
#elif defined(__aarch64__)
// AArch64: read the cntvct_el0 counter register with MRS.
uint64_t result;
__asm __volatile__("mrs %[rt], cntvct_el0" : [ rt ] "=r"(result));
return result;
#elif defined(__powerpc__)
// PowerPC: use __ppc_get_timebase() from <sys/platform/ppc.h>.
return __ppc_get_timebase();
#else
#error No timer implementation for this platform
#endif
}
// Returns the current wall-clock time, as reported by gettimeofday(), in
// microseconds.
//
// The arithmetic is done explicitly in uint64_t. The previous expression
// multiplied by an `unsigned long` constant, which is only 32 bits wide on
// ILP32 targets and would wrap there.
static inline uint64_t toku_current_time_microsec(void) {
  struct timeval t;
  gettimeofday(&t, NULL);
  const uint64_t usec_per_sec = 1000000;
  return static_cast<uint64_t>(t.tv_sec) * usec_per_sec +
         static_cast<uint64_t>(t.tv_usec);
}
#if 0
// sleep microseconds
static inline void toku_sleep_microsec(uint64_t ms) {
struct timeval t;
t.tv_sec = ms / 1000000;
t.tv_usec = ms % 1000000;
select(0, NULL, NULL, NULL, &t);
}
#endif
/*
PORT: Usage of this file:
uint64_t toku_current_time_microsec() // uses gettimeofday
is used to track how much time various operations took (for example, lock
escalation). (TODO: it is not clear why these operations are tracked with
microsecond precision while others use nanoseconds)
tokutime_t toku_time_now() // uses rdtsc
seems to be used for a very similar purpose. This has greater precision
RocksDB environment provides Env::Default()->NowMicros() and NowNanos() which
should be adequate substitutes.
*/

@ -0,0 +1,27 @@
//
// A substitute for ft/txn/txn.h
//
#pragma once
#include <set>
#include "../util/omt.h"
typedef uint64_t TXNID;
#define TXNID_NONE ((TXNID)0)
// A set of transactions
// (TODO: consider using class toku::txnid_set. The reason for using STL
// container was that its API is easier)
// A set of transaction ids. Despite the name this is a std::set, so ids are
// unique and kept in sorted order.
class TxnidVector : public std::set<TXNID> {
 public:
  // Returns true iff `txnid` is a member of the set. Declared const so that
  // it can also be called through a const TxnidVector reference.
  bool contains(TXNID txnid) const { return find(txnid) != end(); }
};
// A value for lock structures with a meaning "the lock is owned by multiple
// transactions (and one has to check the TxnidVector to get their ids)"
#define TXNID_SHARED (TXNID(-1))
// Auxiliary value meaning "any transaction id will do". No real transaction
// may have this as its id.
#define TXNID_ANY (TXNID(-2))

@ -0,0 +1,132 @@
#ifndef ROCKSDB_LITE
#ifndef OS_WIN
/*
This is a dumping ground to make Lock Tree work without the rest of TokuDB.
*/
#include <string.h>
#include "db.h"
#include "ft/ft-status.h"
#include "portability/memory.h"
#include "util/dbt.h"
// portability/os_malloc.cc
// Returns `p` unchanged, but aborts the process when a non-empty allocation
// request produced a null pointer. Callers of the x-prefixed allocators use
// the result without checking it, so failing fast here is safer than letting
// them dereference NULL.
static void *toku_abort_on_alloc_failure(void *p, bool nonzero_request) {
  if (p == nullptr && nonzero_request) {
    abort();
  }
  return p;
}

// Releases memory obtained from any of the allocators below.
void toku_free(void *p) { free(p); }

// malloc() wrapper that aborts instead of returning NULL for size > 0.
void *toku_xmalloc(size_t size) {
  return toku_abort_on_alloc_failure(malloc(size), size != 0);
}

// realloc() wrapper that aborts instead of returning NULL for size > 0.
void *toku_xrealloc(void *v, size_t size) {
  return toku_abort_on_alloc_failure(realloc(v, size), size != 0);
}

// Returns a freshly allocated copy of the `len` bytes at `v`. The copy is
// skipped when len == 0, because memcmp/memcpy require valid pointers even
// for a zero length and both `v` and the malloc(0) result may be null.
void *toku_xmemdup(const void *v, size_t len) {
  void *p = toku_xmalloc(len);
  if (len != 0) {
    memcpy(p, v, len);
  }
  return p;
}

// NOTE(review): in upstream PerconaFT the "x" allocators presumably abort on
// failure rather than return NULL -- confirm against portability/memory.h.
// calloc() wrapper that aborts instead of returning NULL for a non-empty
// request.
void *toku_xcalloc(size_t nmemb, size_t size) {
  return toku_abort_on_alloc_failure(calloc(nmemb, size),
                                     nmemb != 0 && size != 0);
}
// ft-ft-opts.cc:
// locktree
// Default-constructed instrumentation keys, one per locktree mutex or
// condition variable as named.
// NOTE(review): presumably these are the definitions matching extern
// declarations used by the locktree sources -- confirm.
toku_instr_key lock_request_m_wait_cond_key;
toku_instr_key manager_m_escalator_done_key;
toku_instr_key locktree_request_info_mutex_key;
toku_instr_key locktree_request_info_retry_mutex_key;
toku_instr_key locktree_request_info_retry_cv_key;
toku_instr_key treenode_mutex_key;
toku_instr_key manager_mutex_key;
toku_instr_key manager_escalation_mutex_key;
toku_instr_key manager_escalator_mutex_key;
// portability/memory.cc
// Stand-in footprint estimate: the pointer argument is ignored and the
// `touched` byte count is returned unchanged.
size_t toku_memory_footprint(void * /*unused*/, size_t touched) {
  const size_t footprint = touched;
  return footprint;
}
// ft/ft-status.c
// PORT2: note: the @c parameter to TOKUFT_STATUS_INIT must not start with
// "TOKU"
LTM_STATUS_S ltm_status;
// Populates the locktree-manager status table by invoking TOKUFT_STATUS_INIT
// once per row. Does nothing if m_initialized is already set; sets
// m_initialized to true once all rows have been registered.
void LTM_STATUS_S::init() {
if (m_initialized) return;
// Local helper macro: forwards (k, c, t, l) to TOKUFT_STATUS_INIT for *this,
// prefixing the text `l` with "locktree: " and passing
// TOKU_ENGINE_STATUS | TOKU_GLOBAL_STATUS as the last argument.
// NOTE(review): k/c/t/l presumably mean row key, column name, value type and
// legend -- confirm against TOKUFT_STATUS_INIT in ft/ft-status.h.
#define LTM_STATUS_INIT(k, c, t, l) \
TOKUFT_STATUS_INIT((*this), k, c, t, "locktree: " l, \
TOKU_ENGINE_STATUS | TOKU_GLOBAL_STATUS)
// Memory accounting rows.
LTM_STATUS_INIT(LTM_SIZE_CURRENT, LOCKTREE_MEMORY_SIZE, STATUS_UINT64,
"memory size");
LTM_STATUS_INIT(LTM_SIZE_LIMIT, LOCKTREE_MEMORY_SIZE_LIMIT, STATUS_UINT64,
"memory size limit");
// Lock escalation rows.
LTM_STATUS_INIT(LTM_ESCALATION_COUNT, LOCKTREE_ESCALATION_NUM, STATUS_UINT64,
"number of times lock escalation ran");
LTM_STATUS_INIT(LTM_ESCALATION_TIME, LOCKTREE_ESCALATION_SECONDS,
STATUS_TOKUTIME, "time spent running escalation (seconds)");
LTM_STATUS_INIT(LTM_ESCALATION_LATEST_RESULT,
LOCKTREE_LATEST_POST_ESCALATION_MEMORY_SIZE, STATUS_UINT64,
"latest post-escalation memory size");
// Open locktrees and pending requests.
LTM_STATUS_INIT(LTM_NUM_LOCKTREES, LOCKTREE_OPEN_CURRENT, STATUS_UINT64,
"number of locktrees open now");
LTM_STATUS_INIT(LTM_LOCK_REQUESTS_PENDING, LOCKTREE_PENDING_LOCK_REQUESTS,
STATUS_UINT64, "number of pending lock requests");
// STO rows.
LTM_STATUS_INIT(LTM_STO_NUM_ELIGIBLE, LOCKTREE_STO_ELIGIBLE_NUM,
STATUS_UINT64, "number of locktrees eligible for the STO");
LTM_STATUS_INIT(LTM_STO_END_EARLY_COUNT, LOCKTREE_STO_ENDED_NUM,
STATUS_UINT64,
"number of times a locktree ended the STO early");
LTM_STATUS_INIT(LTM_STO_END_EARLY_TIME, LOCKTREE_STO_ENDED_SECONDS,
STATUS_TOKUTIME, "time spent ending the STO early (seconds)");
// Lock wait and timeout rows.
LTM_STATUS_INIT(LTM_WAIT_COUNT, LOCKTREE_WAIT_COUNT, STATUS_UINT64,
"number of wait locks");
LTM_STATUS_INIT(LTM_WAIT_TIME, LOCKTREE_WAIT_TIME, STATUS_UINT64,
"time waiting for locks");
LTM_STATUS_INIT(LTM_LONG_WAIT_COUNT, LOCKTREE_LONG_WAIT_COUNT, STATUS_UINT64,
"number of long wait locks");
LTM_STATUS_INIT(LTM_LONG_WAIT_TIME, LOCKTREE_LONG_WAIT_TIME, STATUS_UINT64,
"long time waiting for locks");
LTM_STATUS_INIT(LTM_TIMEOUT_COUNT, LOCKTREE_TIMEOUT_COUNT, STATUS_UINT64,
"number of lock timeouts");
// Waits on lock escalation.
LTM_STATUS_INIT(LTM_WAIT_ESCALATION_COUNT, LOCKTREE_WAIT_ESCALATION_COUNT,
STATUS_UINT64, "number of waits on lock escalation");
LTM_STATUS_INIT(LTM_WAIT_ESCALATION_TIME, LOCKTREE_WAIT_ESCALATION_TIME,
STATUS_UINT64, "time waiting on lock escalation");
LTM_STATUS_INIT(LTM_LONG_WAIT_ESCALATION_COUNT,
LOCKTREE_LONG_WAIT_ESCALATION_COUNT, STATUS_UINT64,
"number of long waits on lock escalation");
LTM_STATUS_INIT(LTM_LONG_WAIT_ESCALATION_TIME,
LOCKTREE_LONG_WAIT_ESCALATION_TIME, STATUS_UINT64,
"long time waiting on lock escalation");
m_initialized = true;
// The helper macro is local to this function.
#undef LTM_STATUS_INIT
}
// Walks every status row when the table has been initialized. In this port
// the per-row cleanup for STATUS_PARCOUNT rows is still a TODO, so the walk
// currently has no effect; m_initialized is left untouched.
void LTM_STATUS_S::destroy() {
  if (m_initialized) {
    for (int row = 0; row < LTM_STATUS_NUM_ROWS; ++row) {
      const bool is_parcount = (status[row].type == STATUS_PARCOUNT);
      if (is_parcount) {
        // PORT: TODO?? destroy_partitioned_counter(status[row].value.parcount);
      }
    }
  }
}
// Compares two byte strings lexicographically.
//   key1, key1len - first key and its length in bytes
//   key2, key2len - second key and its length in bytes
// Returns a negative value if key1 sorts before key2, a positive value if it
// sorts after, and 0 if both have the same length and bytes. When one key is
// a prefix of the other, the shorter key sorts first (-1 / 1).
//
// A zero-length key may carry a null data pointer (e.g. an empty DBT), and
// memcmp() requires valid pointers even for a zero length, so the byte
// comparison is skipped when there is no common prefix to compare.
int toku_keycompare(const void *key1, size_t key1len, const void *key2,
                    size_t key2len) {
  const size_t comparelen = key1len < key2len ? key1len : key2len;
  if (comparelen > 0) {
    const int c = memcmp(key1, key2, comparelen);
    // Keys usually differ within the common prefix.
    if (__builtin_expect(c != 0, 1)) {
      return c;
    }
  }
  // The common prefix is identical: order by length.
  if (key1len < key2len) {
    return -1;
  }
  if (key1len > key2len) {
    return 1;
  }
  return 0;
}
// Default DBT comparator: orders two DBTs by their raw bytes by forwarding
// each one's data pointer and size to toku_keycompare().
int toku_builtin_compare_fun(const DBT *a, const DBT *b) {
  const int order = toku_keycompare(a->data, a->size, b->data, b->size);
  return order;
}
#endif // OS_WIN
#endif // ROCKSDB_LITE

@ -0,0 +1,153 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef ROCKSDB_LITE
#ifndef OS_WIN
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "dbt.h"
#include <string.h>
#include "../db.h"
#include "../portability/memory.h"
// Zero-fills the whole DBT and returns the same pointer.
DBT *toku_init_dbt(DBT *dbt) {
  // memset() returns its destination argument, i.e. `dbt`.
  return static_cast<DBT *>(memset(dbt, 0, sizeof(DBT)));
}
// Returns, by value, a DBT whose data, size, ulen and flags are all zero.
DBT toku_empty_dbt(void) {
  static const DBT all_zero_dbt = {
      .data = 0, .size = 0, .ulen = 0, .flags = 0};
  return all_zero_dbt;
}
// Resets *dbt with toku_init_dbt(), then records `flags` in it. Returns dbt.
DBT *toku_init_dbt_flags(DBT *dbt, uint32_t flags) {
  toku_init_dbt(dbt);
  DBT &target = *dbt;
  target.flags = flags;
  return dbt;
}
// Frees the data buffer and resets the DBT, but only when its flags are
// exactly DB_DBT_MALLOC or DB_DBT_REALLOC. Any other flags value leaves the
// DBT untouched.
void toku_destroy_dbt(DBT *dbt) {
  const bool frees_buffer =
      dbt->flags == DB_DBT_MALLOC || dbt->flags == DB_DBT_REALLOC;
  if (!frees_buffer) {
    return;
  }
  toku_free(dbt->data);
  toku_init_dbt(dbt);
}
// Resets *dbt and points it at the caller's buffer `k` of `len` bytes. No
// copy is made and the const qualifier on `k` is cast away. Returns dbt.
DBT *toku_fill_dbt(DBT *dbt, const void *k, size_t len) {
  toku_init_dbt(dbt);
  dbt->data = (char *)k;
  dbt->size = len;
  return dbt;
}
// Resets *dbt with the DB_DBT_MALLOC flag and fills it with a copy of the
// `len` bytes at `k`, made by toku_xmemdup(). Returns dbt.
DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len) {
  toku_init_dbt_flags(dbt, DB_DBT_MALLOC);
  dbt->data = toku_xmemdup(k, len);
  dbt->size = len;
  return dbt;
}
// Makes *dst refer to the same data pointer and size as `src`, without
// copying the bytes. flags and ulen of *dst are set to 0. Returns dst.
DBT *toku_copyref_dbt(DBT *dst, const DBT src) {
  dst->data = src.data;
  dst->size = src.size;
  dst->ulen = 0;
  dst->flags = 0;
  return dst;
}
// Fills *dst with a copy of src's bytes by delegating to toku_memdup_dbt().
// Returns dst.
DBT *toku_clone_dbt(DBT *dst, const DBT &src) {
  DBT *const cloned = toku_memdup_dbt(dst, src.data, src.size);
  return cloned;
}
// Frees the simple_dbt's data buffer if it has one, then zero-fills the
// whole structure.
void toku_sdbt_cleanup(struct simple_dbt *sdbt) {
  if (sdbt->data) {
    toku_free(sdbt->data);
  }
  memset(sdbt, 0, sizeof(struct simple_dbt));
}
// Returns the address of a function-local static DBT that serves as the
// positive-infinity sentinel. Only the address is meaningful; the contents
// are all zero.
const DBT *toku_dbt_positive_infinity(void) {
  static DBT pos_inf_sentinel = {
      .data = 0, .size = 0, .ulen = 0, .flags = 0};  // port
  return &pos_inf_sentinel;
}
// Returns the address of a function-local static DBT that serves as the
// negative-infinity sentinel. Only the address is meaningful; the contents
// are all zero.
const DBT *toku_dbt_negative_infinity(void) {
  static DBT neg_inf_sentinel = {
      .data = 0, .size = 0, .ulen = 0, .flags = 0};  // port
  return &neg_inf_sentinel;
}
// True iff `dbt` is the address of one of the two infinity sentinels.
bool toku_dbt_is_infinite(const DBT *dbt) {
  if (dbt == toku_dbt_positive_infinity()) {
    return true;
  }
  return dbt == toku_dbt_negative_infinity();
}
// True iff the DBT has no data buffer (dbt->data is null).
bool toku_dbt_is_empty(const DBT *dbt) {
  // A null data pointer combined with a non-zero size is not a valid state.
  paranoid_invariant(dbt->data != nullptr || dbt->size == 0);
  const bool has_no_data = (dbt->data == nullptr);
  return has_no_data;
}
// Orders two DBT pointers when at least one of them is an infinity sentinel.
// Returns 0 for the same pointer, 1 if `a` sorts after `b`, -1 if it sorts
// before. If neither pointer is a sentinel the final invariant fails.
int toku_dbt_infinite_compare(const DBT *a, const DBT *b) {
  if (a == b) {
    return 0;
  }
  // Positive infinity sorts after everything else.
  if (a == toku_dbt_positive_infinity()) {
    return 1;
  }
  if (b == toku_dbt_positive_infinity()) {
    return -1;
  }
  // Negative infinity sorts before everything else.
  if (a == toku_dbt_negative_infinity()) {
    return -1;
  }
  invariant(b == toku_dbt_negative_infinity());
  return 1;
}
// Shallow equality. Two finite DBTs are equal when they share the same data
// pointer and size; the bytes are not compared. If either is an infinity
// sentinel, they are equal only when they are the same object.
bool toku_dbt_equals(const DBT *a, const DBT *b) {
  const bool any_infinite = toku_dbt_is_infinite(a) || toku_dbt_is_infinite(b);
  if (any_infinite) {
    // a or b is infinite, so they're equal if they are the same infinite
    return a == b;
  }
  return a->data == b->data && a->size == b->size;
}
#endif // OS_WIN
#endif // ROCKSDB_LITE

@ -0,0 +1,84 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include "../db.h"
// TODO: John
// Document this API a little better so that DBT
// memory management can be more widely understood.
DBT *toku_init_dbt(DBT *);
// returns: an initialized but empty dbt (for which toku_dbt_is_empty() is true)
DBT toku_empty_dbt(void);
DBT *toku_init_dbt_flags(DBT *, uint32_t flags);
void toku_destroy_dbt(DBT *);
DBT *toku_fill_dbt(DBT *dbt, const void *k, size_t len);
DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len);
DBT *toku_copyref_dbt(DBT *dst, const DBT src);
DBT *toku_clone_dbt(DBT *dst, const DBT &src);
void toku_sdbt_cleanup(struct simple_dbt *sdbt);
// returns: special DBT pointer representing positive infinity
const DBT *toku_dbt_positive_infinity(void);
// returns: special DBT pointer representing negative infinity
const DBT *toku_dbt_negative_infinity(void);
// returns: true if the given dbt is either positive or negative infinity
bool toku_dbt_is_infinite(const DBT *dbt);
// returns: true if the given dbt has no data (ie: dbt->data == nullptr)
bool toku_dbt_is_empty(const DBT *dbt);
// effect: compares two potentially infinity-valued dbts
// requires: at least one is infinite (assert otherwise)
int toku_dbt_infinite_compare(const DBT *a, const DBT *b);
// returns: true if the given dbts have the same data pointer and size
bool toku_dbt_equals(const DBT *a, const DBT *b);

@ -0,0 +1,143 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include <memory.h>
//******************************************************************************
//
// Overview: A growable array is a little bit like std::vector except that
// it doesn't have constructors (hence can be used in static constructs, since
// the google style guide says no constructors), and it's a little simpler.
// Operations:
// init and deinit (we don't have constructors and destructors).
// fetch_unchecked to get values out.
// store_unchecked to put values in.
// push to add an element at the end
// get_size to find out the size
// memory_size to find out how much memory the data structure is using.
//
//******************************************************************************
namespace toku {
// A minimal growable array of T with explicit init()/deinit() instead of a
// constructor and destructor. Elements are copied with plain assignment and
// the storage is managed with XREALLOC_N / toku_free.
template <typename T>
class GrowableArray {
 public:
  // Effect: Initialize the array to contain no elements.
  void init(void) {
    m_array = nullptr;
    m_size = 0;
    m_size_limit = 0;
  }

  // Effect: Deinitialize the array (freeing any memory it uses, for example).
  void deinit(void) {
    toku_free(m_array);
    m_array = nullptr;
    m_size = 0;
    m_size_limit = 0;
  }

  // Effect: Fetch the ith element. If i is out of range, the system asserts
  // (via paranoid_invariant, matching store_unchecked).
  T fetch_unchecked(size_t i) const {
    paranoid_invariant(i < m_size);
    return m_array[i];
  }

  // Effect: Store v in the ith element. If i is out of range, the system
  // asserts.
  void store_unchecked(size_t i, T v) {
    paranoid_invariant(i < m_size);
    m_array[i] = v;
  }

  // Effect: Add v to the end of the array (increasing the size). The
  // amortized cost of this operation is constant: the capacity starts at 1
  // and doubles whenever the array is full.
  void push(T v) {
    if (m_size >= m_size_limit) {
      if (m_array == nullptr) {
        m_size_limit = 1;
      } else {
        m_size_limit *= 2;
      }
      XREALLOC_N(m_size_limit, m_array);
    }
    m_array[m_size++] = v;
  }

  // Effect: Return the number of elements in the array.
  size_t get_size(void) const { return m_size; }

  // Effect: Return the size (in bytes) that the array occupies in memory.
  // This is really only an estimate: the object itself plus the allocated
  // capacity.
  size_t memory_size(void) const {
    return sizeof(*this) + sizeof(T) * m_size_limit;
  }

 private:
  T *m_array;           // Element storage; null until the first push.
  size_t m_size;        // Number of elements currently stored.
  size_t m_size_limit;  // How much space is allocated in array.
};
} // namespace toku

@ -0,0 +1,187 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef ROCKSDB_LITE
#ifndef OS_WIN
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "memarena.h"
#include <string.h>
#include <algorithm>
#include "../portability/memory.h"
// Effect: Reset all bookkeeping and, when initial_size is non-zero, allocate
// a first chunk of initial_size bytes. With initial_size == 0 no memory is
// allocated until the first malloc_from_arena() call.
void memarena::create(size_t initial_size) {
  // No retired chunks yet.
  _other_chunks = nullptr;
  _n_other_chunks = 0;
  _size_of_other_chunks = 0;
  _footprint_of_other_chunks = 0;

  // Fresh current chunk: empty, with the requested capacity.
  _current_chunk = arena_chunk();
  _current_chunk.size = initial_size;
  if (initial_size != 0) {
    XMALLOC_N(initial_size, _current_chunk.buf);
  }
}
void memarena::destroy(void) {
if (_current_chunk.buf) {
toku_free(_current_chunk.buf);
}
for (int i = 0; i < _n_other_chunks; i++) {
toku_free(_other_chunks[i].buf);
}
if (_other_chunks) {
toku_free(_other_chunks);
}
_current_chunk = arena_chunk();
_other_chunks = nullptr;
_n_other_chunks = 0;
}
// Effect: Round size up to the nearest multiple of the 4096-byte page size.
// A size that is already a multiple of the page size (including 0) is
// returned unchanged.
static size_t round_to_page(size_t size) {
  const size_t page_size = 4096;
  const size_t offset_mask = page_size - 1;
  const size_t rounded = (size + offset_mask) & ~offset_mask;
  assert((rounded & offset_mask) == 0);  // result is page aligned
  assert(rounded >= size);               // result is not too small
  assert(rounded < size + page_size);    // grew by less than one page
  return rounded;
}
// Cap on the exponential growth of chunk sizes. A single request larger than
// this still gets a chunk big enough to hold it.
static const size_t MEMARENA_MAX_CHUNK_SIZE = 64 * 1024 * 1024;

// Effect: Carve size bytes out of the current chunk and return a pointer to
// them. When the current chunk is missing or too full, it is retired into
// _other_chunks (if it exists) and a new, larger chunk takes its place.
void *memarena::malloc_from_arena(size_t size) {
  const bool have_chunk = _current_chunk.buf != nullptr;
  const bool request_fits =
      have_chunk && _current_chunk.size >= _current_chunk.used + size;

  if (!request_fits) {
    if (have_chunk) {
      // Retire the current chunk: append it to the array of other chunks
      // and fold its size and footprint into the running totals.
      invariant(_current_chunk.size > 0);
      const int retired_idx = _n_other_chunks;
      XREALLOC_N(retired_idx + 1, _other_chunks);
      _other_chunks[retired_idx] = _current_chunk;
      _n_other_chunks = retired_idx + 1;
      _size_of_other_chunks += _current_chunk.size;
      _footprint_of_other_chunks +=
          toku_memory_footprint(_current_chunk.buf, _current_chunk.used);
    }

    // Double the previous chunk size, capped at the maximum chunk size, but
    // never go below the requested size so that this allocation always
    // fits. Finally round up to a whole number of pages.
    const size_t doubled = 2 * _current_chunk.size;
    size_t new_size = std::min(MEMARENA_MAX_CHUNK_SIZE, doubled);
    new_size = std::max(new_size, size);
    new_size = round_to_page(new_size);

    XMALLOC_N(new_size, _current_chunk.buf);
    _current_chunk.used = 0;
    _current_chunk.size = new_size;
  }
  invariant(_current_chunk.buf != nullptr);

  // Hand out the next unused bytes of the current chunk.
  void *const result = _current_chunk.buf + _current_chunk.used;
  _current_chunk.used += size;
  return result;
}
// Effect: Transfer ownership of every chunk of this memarena (the retired
// chunks and the current one) to dest, where they all become retired chunks.
// Afterwards this memarena is empty and owns no memory.
void memarena::move_memory(memarena *dest) {
  // Grow dest's chunk array so it can hold our retired chunks plus our
  // current chunk.
  XREALLOC_N(dest->_n_other_chunks + _n_other_chunks + 1, dest->_other_chunks);

  // Account for the transferred memory in dest's totals.
  const auto current_footprint =
      toku_memory_footprint(_current_chunk.buf, _current_chunk.used);
  dest->_size_of_other_chunks += _size_of_other_chunks + _current_chunk.size;
  dest->_footprint_of_other_chunks +=
      _footprint_of_other_chunks + current_footprint;

  // Append our retired chunks, then our current chunk, to dest's array.
  for (int src_idx = 0; src_idx < _n_other_chunks; src_idx++) {
    const int dst_idx = dest->_n_other_chunks;
    dest->_other_chunks[dst_idx] = _other_chunks[src_idx];
    dest->_n_other_chunks = dst_idx + 1;
  }
  dest->_other_chunks[dest->_n_other_chunks] = _current_chunk;
  dest->_n_other_chunks += 1;

  // Only the tracking array is freed here; the chunk buffers now belong to
  // dest. Reset this memarena to the empty state.
  toku_free(_other_chunks);
  _other_chunks = nullptr;
  _n_other_chunks = 0;
  _size_of_other_chunks = 0;
  _footprint_of_other_chunks = 0;
  _current_chunk = arena_chunk();
}
// Effect: Return the size of this object, plus total_size_in_use(), plus the
// bytes taken by the array that tracks the retired chunks.
size_t memarena::total_memory_size(void) const {
  const size_t chunk_table_bytes = _n_other_chunks * sizeof(*_other_chunks);
  return sizeof(*this) + total_size_in_use() + chunk_table_bytes;
}
// Effect: Return the accumulated size of all retired chunks plus the number
// of bytes handed out from the current chunk.
size_t memarena::total_size_in_use(void) const {
  const size_t current_used = _current_chunk.used;
  return _size_of_other_chunks + current_used;
}
// Effect: Return the size of this object, plus the footprint recorded for the
// retired chunks, plus the footprint of the current chunk as reported by
// toku_memory_footprint(), plus the bytes taken by the chunk tracking array.
size_t memarena::total_footprint(void) const {
  const auto current_footprint =
      toku_memory_footprint(_current_chunk.buf, _current_chunk.used);
  const size_t chunk_table_bytes = _n_other_chunks * sizeof(*_other_chunks);
  return sizeof(*this) + _footprint_of_other_chunks + current_footprint +
         chunk_table_bytes;
}
////////////////////////////////////////////////////////////////////////////////
const void *memarena::chunk_iterator::current(size_t *used) const {
if (_chunk_idx < 0) {
*used = _ma->_current_chunk.used;
return _ma->_current_chunk.buf;
} else if (_chunk_idx < _ma->_n_other_chunks) {
*used = _ma->_other_chunks[_chunk_idx].used;
return _ma->_other_chunks[_chunk_idx].buf;
}
*used = 0;
return nullptr;
}
// Effect: Advance the iterator to the following chunk.
void memarena::chunk_iterator::next() {
  _chunk_idx += 1;
}
// Effect: Report whether current() would return a chunk. While positioned on
// the memarena's current chunk (negative index) that is the case only if the
// chunk has a buffer; otherwise the index must address a retired chunk.
bool memarena::chunk_iterator::more() const {
  const bool on_current_chunk = _chunk_idx < 0;
  return on_current_chunk ? (_ma->_current_chunk.buf != nullptr)
                          : (_chunk_idx < _ma->_n_other_chunks);
}
#endif // OS_WIN
#endif // ROCKSDB_LITE

@ -0,0 +1,127 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include <stdlib.h>
/*
* A memarena is used to efficiently store a collection of objects that never
* move. The pattern is to allocate more and more stuff and free all of the items at
* once. The underlying memory will store 1 or more objects per chunk. Each
* chunk is contiguously laid out in memory but chunks are not necessarily
* contiguous with each other.
*/
class memarena {
 public:
  // Constructs an empty memarena: no current chunk, no retired chunks and
  // zeroed totals (see the member initializers below).
  memarena() {}

  // Effect: Create a memarena with the specified initial size.
  void create(size_t initial_size);

  // Effect: Free every chunk and the chunk tracking array.
  void destroy(void);

  // Effect: Allocate some memory. The returned pointer stays valid until the
  //  memarena is cleared or closed.
  //  In case of ENOMEM, aborts.
  void *malloc_from_arena(size_t size);

  // Effect: Move all the memory from this memarena into DEST.
  //  When this (source) memarena is closed afterwards, the memory is not
  //  freed. When DEST is closed, the memory is freed, unless DEST has moved
  //  its memory on to yet another memarena.
  void move_memory(memarena *dest);

  // Effect: Calculate the amount of memory used by a memory arena: this
  //  object, total_size_in_use() and the chunk tracking array.
  size_t total_memory_size(void) const;

  // Effect: Calculate the space in use: the accumulated size of the retired
  //  chunks plus the used bytes of the current chunk.
  size_t total_size_in_use(void) const;

  // Effect: Calculate the amount of memory used, according to
  //  toku_memory_footprint(), which is a more expensive but more accurate
  //  count of memory used.
  size_t total_footprint(void) const;

  // Iterator over the underlying chunks that store objects in the memarena.
  // A chunk is represented by a pointer to const memory and a usable byte
  // count.
  class chunk_iterator {
   public:
    chunk_iterator(const memarena *ma) : _ma(ma) {}

    // returns: base pointer to the current chunk
    //          *used set to the number of usable bytes
    //          if more() is false, returns nullptr and *used = 0
    const void *current(size_t *used) const;

    // requires: more() is true
    void next();

    bool more() const;

   private:
    // The memarena being iterated over.
    const memarena *_ma;
    // -1 represents the 'initial' chunk in a memarena, ie: ma->_current_chunk
    // >= 0 represents the i'th chunk in the ma->_other_chunks array
    int _chunk_idx = -1;
  };

 private:
  // One contiguous buffer plus its bookkeeping.
  struct arena_chunk {
    arena_chunk() {}
    char *buf = nullptr;  // start of the buffer, or null if none allocated
    size_t used = 0;      // bytes already handed out from buf
    size_t size = 0;      // capacity of buf in bytes
  };

  arena_chunk _current_chunk;           // chunk new allocations come from
  arena_chunk *_other_chunks = nullptr; // array of retired chunks
  int _n_other_chunks = 0;              // number of entries in _other_chunks
  size_t _size_of_other_chunks = 0;  // the buf_size of all the other chunks.
  size_t _footprint_of_other_chunks =
      0;  // the footprint of all the other chunks.

  friend class memarena_unit_test;
};

@ -0,0 +1,793 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include <memory.h>
#include <stdint.h>
#include "../portability/toku_portability.h"
#include "../portability/toku_race_tools.h"
#include "growable_array.h"
namespace toku {
/**
* Order Maintenance Tree (OMT)
*
* Maintains a collection of totally ordered values, where each value has an
* integer weight. The OMT is a mutable datatype.
*
* The Abstraction:
*
* An OMT is a vector of values, $V$, where $|V|$ is the length of the vector.
* The vector is numbered from $0$ to $|V|-1$.
* Each value has a weight. The weight of the $i$th element is denoted
* $w(V_i)$.
*
* We can create a new OMT, which is the empty vector.
*
* We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
* $|V'|=1+|V|$ and
*
* V'_j = V_j if $j<i$
* x if $j=i$
* V_{j-1} if $j>i$.
*
* We can specify $i$ using a kind of function instead of as an integer.
* Let $b$ be a function mapping from values to nonzero integers, such that
* the signum of $b$ is monotonically increasing.
* We can specify $i$ as the minimum integer such that $b(V_i)>0$.
*
* We look up a value using its index, or using a Heaviside function.
* For lookups, we allow $b$ to be zero for some values, and again the signum of
* $b$ must be monotonically increasing. When looking up values, we can look up
* $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$. (With a
* special return code if no such value exists.) (Rationale: Ordinarily we want
* $i$ to be unique. But for various reasons we want to allow multiple zeros,
* and we want the smallest $i$ in that case.) $V_i$ where $i$ is the minimum
* integer such that $b(V_i)>0$. (Or an indication that no such value exists.)
* $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$. (Or an
* indication that no such value exists.)
*
* When looking up a value using a Heaviside function, we get the value and its
* index.
*
* We can also split an OMT into two OMTs, splitting the weight of the values
* evenly. Find a value $j$ such that the values to the left of $j$ have about
* the same total weight as the values to the right of $j$. The resulting two
* OMTs contain the values to the left of $j$ and the values to the right of $j$
* respectively. All of the values from the original OMT go into one of the new
* OMTs. If the weights of the values don't split exactly evenly, then the
* implementation has the freedom to choose whether the new left OMT or the new
* right OMT is larger.
*
* Performance:
* Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$
* calls to the Heaviside function. The memory required is O(|V|).
*
* Usage:
* The omt is templated by two parameters:
* - omtdata_t is what will be stored within the omt. These could be pointers
* or real data types (ints, structs).
* - omtdataout_t is what will be returned by find and related functions. By
* default, it is the same as omtdata_t, but you can set it to (omtdata_t *). To
* create an omt which will store "TXNID"s, for example, it is a good idea to
* typedef the template: typedef omt<TXNID> txnid_omt_t; If you are storing
* structs, you may want to be able to get a pointer to the data actually stored
* in the omt (see find_zero). To do this, use the second template parameter:
* typedef omt<struct foo, struct foo *> foo_omt_t;
*/
namespace omt_internal {
// Handle to a subtree, stored as a 32-bit node index. This is the primary
// template; it carries no mark bit, so the whole word holds the index and
// UINT32_MAX is reserved as the "no subtree" value.
template <bool subtree_supports_marks>
class subtree_templated {
 private:
  uint32_t m_index;

 public:
  // Reserved index value meaning "this subtree is empty".
  static const uint32_t NODE_NULL = UINT32_MAX;

  // Make this handle refer to no subtree.
  inline void set_to_null(void) { m_index = NODE_NULL; }

  // True when this handle refers to no subtree.
  inline bool is_null(void) const { return this->get_index() == NODE_NULL; }

  // The stored node index (NODE_NULL when empty).
  inline uint32_t get_index(void) const { return m_index; }

  // Point this handle at the node with the given index, which must not be
  // the reserved NODE_NULL value.
  inline void set_index(uint32_t index) {
    paranoid_invariant(index != NODE_NULL);
    m_index = index;
  }
} __attribute__((__packed__, aligned(4)));
// Specialization used when marks are supported. The top bit of the 32-bit
// word is a flag ("the bit"); the low 31 bits hold the node index, so the
// reserved "no subtree" index is INT32_MAX.
template <>
class subtree_templated<true> {
 private:
  uint32_t m_bitfield;
  static const uint32_t MASK_BIT = ((uint32_t)1) << 31;  // the flag bit
  static const uint32_t MASK_INDEX = ~MASK_BIT;          // the index bits

  // Replace the index bits while keeping the current flag bit.
  inline void set_index_internal(uint32_t new_index) {
    const uint32_t preserved_bit = m_bitfield & MASK_BIT;
    m_bitfield = preserved_bit | new_index;
  }

 public:
  // Reserved index value meaning "this subtree is empty".
  static const uint32_t NODE_NULL = INT32_MAX;

  // Make this handle refer to no subtree (the flag bit is left untouched).
  inline void set_to_null(void) { this->set_index_internal(NODE_NULL); }

  // True when this handle refers to no subtree.
  inline bool is_null(void) const { return this->get_index() == NODE_NULL; }

  // The stored node index, with the flag bit masked away. The read of the
  // word is bracketed by DRD ignore annotations.
  inline uint32_t get_index(void) const {
    TOKU_DRD_IGNORE_VAR(m_bitfield);
    const uint32_t snapshot = m_bitfield;
    TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
    return snapshot & MASK_INDEX;
  }

  // Point this handle at the node with the given index, which must be
  // smaller than NODE_NULL. The flag bit is left untouched.
  inline void set_index(uint32_t index) {
    paranoid_invariant(index < NODE_NULL);
    this->set_index_internal(index);
  }

  // Current state of the flag bit. The read of the word is bracketed by DRD
  // ignore annotations.
  inline bool get_bit(void) const {
    TOKU_DRD_IGNORE_VAR(m_bitfield);
    const uint32_t snapshot = m_bitfield;
    TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
    return (snapshot & MASK_BIT) != 0;
  }

  // Set the flag bit.
  inline void enable_bit(void) {
    // These bits may be set by a thread with a write lock on some
    // leaf, and the index can be read by another thread with a (read
    // or write) lock on another thread.  Also, the has_marks_below
    // bit can be set by two threads simultaneously.  Neither of these
    // are real races, so if we are using DRD we should tell it to
    // ignore these bits just while we set this bit.  If there were a
    // race in setting the index, that would be a real race.
    TOKU_DRD_IGNORE_VAR(m_bitfield);
    m_bitfield = m_bitfield | MASK_BIT;
    TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
  }

  // Clear the flag bit, keeping the index bits.
  inline void disable_bit(void) { m_bitfield = m_bitfield & MASK_INDEX; }
} __attribute__((__packed__));
// Tree node for OMTs: a stored value, a weight, and handles to the left and
// right subtrees. This primary template has no mark bits.
template <typename omtdata_t, bool subtree_supports_marks>
class omt_node_templated {
 public:
  omtdata_t value;
  uint32_t weight;
  subtree_templated<subtree_supports_marks> left;
  subtree_templated<subtree_supports_marks> right;

  // Nothing to clear here. The function exists in both implementations
  // because there is no "static if" a caller could use to call it only on
  // the marks-supporting variant.
  void clear_stolen_bits() {}
};  // note: originally this class had __attribute__((__packed__, aligned(4)))
// Tree node for OMTs that support marks. Two flags are kept in the spare bit
// of the subtree handles: the "marked" flag lives in left, the "marks below"
// flag lives in right.
template <typename omtdata_t>
class omt_node_templated<omtdata_t, true> {
 public:
  omtdata_t value;
  uint32_t weight;
  subtree_templated<true> left;
  subtree_templated<true> right;

  // "marked" flag, stored in the bit of the left handle.
  inline bool get_marked(void) const { return left.get_bit(); }
  inline void set_marked_bit(void) { left.enable_bit(); }
  inline void unset_marked_bit(void) { left.disable_bit(); }

  // "marks below" flag, stored in the bit of the right handle.
  inline bool get_marks_below(void) const { return right.get_bit(); }
  inline void set_marks_below_bit(void) {
    // This function can be called by multiple threads.
    // Checking first reduces cache invalidation.
    if (this->get_marks_below()) {
      return;
    }
    right.enable_bit();
  }
  inline void unset_marks_below_bit(void) { right.disable_bit(); }

  // Clear both flags stored in the subtree handles.
  inline void clear_stolen_bits(void) {
    this->unset_marked_bit();
    this->unset_marks_below_bit();
  }
};  // note: originally this class had __attribute__((__packed__, aligned(4)))
} // namespace omt_internal
template <typename omtdata_t, typename omtdataout_t = omtdata_t,
bool supports_marks = false>
class omt {
public:
/**
* Effect: Create an empty OMT.
* Performance: constant time.
*/
void create(void);
/**
* Effect: Create an empty OMT with no internal allocated space.
* Performance: constant time.
* Rationale: In some cases we need a valid omt but don't want to malloc.
*/
void create_no_array(void);
/**
* Effect: Create a OMT containing values. The number of values is in
* numvalues. Stores the new OMT in *omtp. Requires: this has not been created
* yet Requires: values != NULL Requires: values is sorted Performance:
* time=O(numvalues) Rationale: Normally to insert N values takes O(N lg N)
* amortized time. If the N values are known in advance, are sorted, and the
* structure is empty, we can batch insert them much faster.
*/
__attribute__((nonnull)) void create_from_sorted_array(
const omtdata_t *const values, const uint32_t numvalues);
/**
* Effect: Create an OMT containing values. The number of values is in
* numvalues. On success the OMT takes ownership of *values array, and sets
* values=NULL. Requires: this has not been created yet Requires: values !=
* NULL Requires: *values is sorted Requires: *values was allocated with
* toku_malloc Requires: Capacity of the *values array is <= new_capacity
* Requires: On success, *values may not be accessed again by the caller.
* Performance: time=O(1)
* Rationale: create_from_sorted_array takes O(numvalues) time.
* By taking ownership of the array, we save a malloc and
* memcpy, and possibly a free (if the caller is done with the array).
*/
void create_steal_sorted_array(omtdata_t **const values,
const uint32_t numvalues,
const uint32_t new_capacity);
/**
* Effect: Create a new OMT, storing it in *newomt.
* The values to the right of index (starting at index) are moved to *newomt.
* Requires: newomt != NULL
* Returns
* 0 success,
* EINVAL if index > toku_omt_size(omt)
* On nonzero return, omt and *newomt are unmodified.
* Performance: time=O(n)
* Rationale: We don't need a split-evenly operation. We need to split items
* so that their total sizes are even, and other similar splitting criteria.
* It's easy to split evenly by calling size(), and dividing by two.
*/
__attribute__((nonnull)) int split_at(omt *const newomt, const uint32_t idx);
/**
* Effect: Appends leftomt and rightomt to produce a new omt.
* Creates this as the new omt.
* leftomt and rightomt are destroyed.
* Performance: time=O(n) is acceptable, but one can imagine implementations
* that are O(\log n) worst-case.
*/
__attribute__((nonnull)) void merge(omt *const leftomt, omt *const rightomt);
/**
* Effect: Creates a copy of an omt.
* Creates this as the clone.
* Each element is copied directly. If they are pointers, the underlying
* data is not duplicated. Performance: O(n) or the running time of
* fill_array_with_subtree_values()
*/
void clone(const omt &src);
/**
* Effect: Set the tree to be empty.
* Note: Will not reallocate or resize any memory.
* Performance: time=O(1)
*/
void clear(void);
/**
* Effect: Destroy an OMT, freeing all its memory.
* If the values being stored are pointers, their underlying data is not
* freed. See free_items() Those values may be freed before or after calling
* toku_omt_destroy. Rationale: Returns no values since free() cannot fail.
* Rationale: Does not free the underlying pointers to reduce complexity.
* Performance: time=O(1)
*/
void destroy(void);
/**
* Effect: return |this|.
* Performance: time=O(1)
*/
uint32_t size(void) const;
/**
* Effect: Insert value into the OMT.
* If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST.
* Otherwise, let i be the minimum value such that $h(V_i, v)>0$.
* If no such i exists, then let i be |V|
* Then this has the same effect as
* insert_at(tree, value, i);
* If idx!=NULL then i is stored in *idx
* Requires: The signum of h must be monotonically increasing.
* Returns:
* 0 success
* DB_KEYEXIST the key is present (h was equal to zero for some value)
* On nonzero return, omt is unchanged.
* Performance: time=O(\log N) amortized.
* Rationale: Some future implementation may be O(\log N) worst-case time, but
* O(\log N) amortized is good enough for now.
*/
template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
int insert(const omtdata_t &value, const omtcmp_t &v, uint32_t *const idx);
/**
* Effect: Increases indexes of all items at slot >= idx by 1.
* Insert value into the position at idx.
* Returns:
* 0 success
* EINVAL if idx > this->size()
* On error, omt is unchanged.
* Performance: time=O(\log N) amortized time.
* Rationale: Some future implementation may be O(\log N) worst-case time, but
* O(\log N) amortized is good enough for now.
*/
int insert_at(const omtdata_t &value, const uint32_t idx);
/**
* Effect: Replaces the item at idx with value.
* Returns:
* 0 success
* EINVAL if idx>=this->size()
* On error, omt is unchanged.
* Performance: time=O(\log N)
* Rationale: The FT needs to be able to replace a value with another copy of
* the same value (allocated in a different location)
*
*/
int set_at(const omtdata_t &value, const uint32_t idx);
/**
* Effect: Delete the item in slot idx.
* Decreases indexes of all items at slot > idx by 1.
* Returns
* 0 success
* EINVAL if idx>=this->size()
* On error, omt is unchanged.
* Rationale: To delete an item, first find its index using find or find_zero,
* then delete it. Performance: time=O(\log N) amortized.
*/
int delete_at(const uint32_t idx);
/**
* Effect: Iterate over the values of the omt, from left to right, calling f
* on each value. The first argument passed to f is a ref-to-const of the
* value stored in the omt. The second argument passed to f is the index of
* the value. The third argument passed to f is iterate_extra. The indices run
* from 0 (inclusive) to this->size() (exclusive). Requires: f != NULL
* Returns:
* If f ever returns nonzero, then the iteration stops, and the value
* returned by f is returned by iterate. If f always returns zero, then
* iterate returns 0. Requires: Don't modify the omt while running. (E.g., f
* may not insert or delete values from the omt.) Performance: time=O(i+\log
* N) where i is the number of times f is called, and N is the number of
* elements in the omt. Rationale: Although the functional iterator requires
* defining another function (as opposed to C++ style iterator), it is much
* easier to read. Rationale: We may at some point use functors, but for now
* this is a smaller change from the old OMT.
*/
template <typename iterate_extra_t,
int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
int iterate(iterate_extra_t *const iterate_extra) const;
/**
* Effect: Iterate over the values of the omt, from left to right, calling f
* on each value. The first argument passed to f is a ref-to-const of the
* value stored in the omt. The second argument passed to f is the index of
* the value. The third argument passed to f is iterate_extra. The indices run
* from 0 (inclusive) to this->size() (exclusive). We will iterate only over
* [left,right)
*
* Requires: left <= right
* Requires: f != NULL
* Returns:
* EINVAL if right > this->size()
* If f ever returns nonzero, then the iteration stops, and the value
* returned by f is returned by iterate_on_range. If f always returns zero,
* then iterate_on_range returns 0. Requires: Don't modify the omt while
* running. (E.g., f may not insert or delete values from the omt.)
* Performance: time=O(i+\log N) where i is the number of times f is called,
* and N is the number of elements in the omt. Rational: Although the
* functional iterator requires defining another function (as opposed to C++
* style iterator), it is much easier to read.
*/
template <typename iterate_extra_t,
int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
int iterate_on_range(const uint32_t left, const uint32_t right,
iterate_extra_t *const iterate_extra) const;
/**
* Effect: Iterate over the values of the omt, and mark the nodes that are
* visited. Other than the marks, this behaves the same as iterate_on_range.
* Requires: supports_marks == true
* Performance: time=O(i+\log N) where i is the number of times f is called,
* and N is the number of elements in the omt. Notes: This function MAY be
* called concurrently by multiple threads, but not concurrently with any
* other non-const function.
*/
template <typename iterate_extra_t,
int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
int iterate_and_mark_range(const uint32_t left, const uint32_t right,
iterate_extra_t *const iterate_extra);
/**
* Effect: Iterate over the values of the omt, from left to right, calling f
* on each value whose node has been marked. Other than the marks, this
* behaves the same as iterate. Requires: supports_marks == true Performance:
* time=O(i+\log N) where i is the number of times f is called, and N is the
* number of elements in the omt.
*/
template <typename iterate_extra_t,
int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
int iterate_over_marked(iterate_extra_t *const iterate_extra) const;
/**
* Effect: Delete all elements from the omt, whose nodes have been marked.
* Requires: supports_marks == true
* Performance: time=O(N + i\log N) where i is the number of marked elements,
* {c,sh}ould be faster
*/
void delete_all_marked(void);
/**
* Effect: Verify that the internal state of the marks in the tree are
* self-consistent. Crashes the system if the marks are in a bad state.
* Requires: supports_marks == true
* Performance: time=O(N)
* Notes:
* Even though this is a const function, it requires exclusive access.
* Rationale:
* The current implementation of the marks relies on a sort of
* "cache" bit representing the state of bits below it in the tree.
* This allows glass-box testing that these bits are correct.
*/
void verify_marks_consistent(void) const;
/**
* Effect: None
* Returns whether there are any marks in the tree.
*/
bool has_marks(void) const;
/**
* Effect: Iterate over the values of the omt, from left to right, calling f
* on each value. The first argument passed to f is a pointer to the value
* stored in the omt. The second argument passed to f is the index of the
* value. The third argument passed to f is iterate_extra. The indices run
* from 0 (inclusive) to this->size() (exclusive). Requires: same as for
* iterate() Returns: same as for iterate() Performance: same as for iterate()
* Rationale: In general, most iterators should use iterate() since they
* should not modify the data stored in the omt. This function is for
* iterators which need to modify values (for example, free_items). Rationale:
* We assume if you are transforming the data in place, you want to do it to
* everything at once, so there is not yet an iterate_on_range_ptr (but there
* could be).
*/
template <typename iterate_extra_t,
int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
void iterate_ptr(iterate_extra_t *const iterate_extra);
/**
* Effect: Set *value=V_idx
* Returns
* 0 success
* EINVAL if index>=toku_omt_size(omt)
* On nonzero return, *value is unchanged
* Performance: time=O(\log N)
*/
int fetch(const uint32_t idx, omtdataout_t *const value) const;
/**
* Effect: Find the smallest i such that h(V_i, extra)>=0
* If there is such an i and h(V_i,extra)==0 then set *idxp=i, set *value =
* V_i, and return 0. If there is such an i and h(V_i,extra)>0 then set
* *idxp=i and return DB_NOTFOUND. If there is no such i then set
* *idx=this->size() and return DB_NOTFOUND. Note: value is of type
* omtdataout_t, which may be of type (omtdata_t) or (omtdata_t *) but is
* fixed by the instantiation. If it is the value type, then the value is
* copied out (even if the value type is a pointer to something else) If it is
* the pointer type, then *value is set to a pointer to the data within the
* omt. This is determined by the type of the omt as initially declared. If
* the omt is declared as omt<foo_t>, then foo_t's will be stored and foo_t's
* will be returned by find and related functions. If the omt is declared as
* omt<foo_t, foo_t *>, then foo_t's will be stored, and pointers to the
* stored items will be returned by find and related functions. Rationale:
* Structs too small for malloc should be stored directly in the omt.
* These structs may need to be edited as they exist inside the omt, so we
* need a way to get a pointer within the omt. Using separate functions for
* returning pointers and values increases code duplication and reduces
* type-checking. That also reduces the ability of the creator of a data
* structure to give advice to its future users. Slight overloading in this
* case seemed to provide a better API and better type checking.
*/
template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
int find_zero(const omtcmp_t &extra, omtdataout_t *const value,
uint32_t *const idxp) const;
/**
* Effect:
* If direction >0 then find the smallest i such that h(V_i,extra)>0.
* If direction <0 then find the largest i such that h(V_i,extra)<0.
* (Direction may not be equal to zero.)
* If value!=NULL then store V_i in *value
* If idxp!=NULL then store i in *idxp.
* Requires: The signum of h is monotonically increasing.
* Returns
* 0 success
* DB_NOTFOUND no such value is found.
* On nonzero return, *value and *idxp are unchanged
* Performance: time=O(\log N)
* Rationale:
* Here's how to use the find function to find various things
* Cases for find:
* find first value: ( h(v)=+1, direction=+1 )
* find last value ( h(v)=-1, direction=-1 )
* find first X ( h(v)=(v< x) ? -1 : 1 direction=+1 )
* find last X ( h(v)=(v<=x) ? -1 : 1 direction=-1 )
* find X or successor to X ( same as find first X. )
*
* Rationale: To help understand heaviside functions and behavior of find:
* There are 7 kinds of heaviside functions.
* The signum of h must be monotonically increasing.
* Given a function of the following form, A is the element
* returned for direction<0, B is the element returned
* for direction>0, C is the element returned for
* direction==0 (see find_zero) with a return of 0, and D is the element
* returned for direction==0 (see find_zero) with a return of DB_NOTFOUND.
* If any of A, B, or C are not found, then asking for the
* associated direction will return DB_NOTFOUND.
* See find_zero for more information.
*
* Let the following represent the signum of the heaviside function.
*
*   -...-
*       A
*        D
*
*   +...+
*   B
*   D
*
*   0...0
*   C
*
*   -...-0...0
*       AC
*
*   0...0+...+
*   C    B
*
*   -...-+...+
*       AB
*        D
*
*   -...-0...0+...+
*       AC    B
*/
template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
int find(const omtcmp_t &extra, int direction, omtdataout_t *const value,
uint32_t *const idxp) const;
/**
* Effect: Return the size (in bytes) of the omt, as it resides in main
* memory. If the data stored are pointers, don't include the size of what
* they all point to.
*/
size_t memory_size(void);
private:
typedef uint32_t node_idx;
typedef omt_internal::subtree_templated<supports_marks> subtree;
typedef omt_internal::omt_node_templated<omtdata_t, supports_marks> omt_node;
ENSURE_POD(subtree);
// Array representation; stored in member `a` of the union `d` below.
// NOTE(review): presumably the live values occupy
// values[start_idx .. start_idx + num_values) -- confirm in omt_impl.h.
struct omt_array {
uint32_t start_idx;
uint32_t num_values;
omtdata_t *values;
};
// Tree representation; stored in member `t` of the union `d` below.
// NOTE(review): free_idx looks like the next unused slot of `nodes` (see
// node_malloc() / node_free()) -- confirm in omt_impl.h.
struct omt_tree {
subtree root;
uint32_t free_idx;
omt_node *nodes;
};
// NOTE(review): is_array presumably records which member of `d` is currently
// in use (see convert_to_array() / convert_to_tree()) -- confirm.
bool is_array;
uint32_t capacity;
// Both representations share the same storage, so only one of `a` / `t` is
// meaningful at any time.
union {
struct omt_array a;
struct omt_tree t;
} d;
// Private helpers. Only declarations appear here; the definitions come from
// omt_impl.h, which is included at the bottom of this header.
// __attribute__((nonnull)) without an argument list declares that every
// pointer parameter of the function must be non-null.
__attribute__((nonnull)) void unmark(const subtree &subtree,
const uint32_t index,
GrowableArray<node_idx> *const indexes);
// Creation helpers taking the requested capacity.
void create_internal_no_array(const uint32_t new_capacity);
void create_internal(const uint32_t new_capacity);
uint32_t nweight(const subtree &subtree) const;
// Obtain / release a node handle (node_idx).
node_idx node_malloc(void);
void node_free(const node_idx idx);
void maybe_resize_array(const uint32_t n);
__attribute__((nonnull)) void fill_array_with_subtree_values(
omtdata_t *const array, const subtree &subtree) const;
// NOTE(review): convert_to_array() / convert_to_tree() presumably switch
// between the omt_array and omt_tree representations held in `d` -- confirm.
void convert_to_array(void);
__attribute__((nonnull)) void rebuild_from_sorted_array(
subtree *const subtree, const omtdata_t *const values,
const uint32_t numvalues);
void convert_to_tree(void);
void maybe_resize_or_convert(const uint32_t n);
bool will_need_rebalance(const subtree &subtree, const int leftmod,
const int rightmod) const;
// insert_internal / delete_internal report a subtree through the
// rebalance_subtree out-parameter.
// NOTE(review): presumably the subtree the caller should pass to rebalance();
// confirm in omt_impl.h.
__attribute__((nonnull)) void insert_internal(
subtree *const subtreep, const omtdata_t &value, const uint32_t idx,
subtree **const rebalance_subtree);
void set_at_internal_array(const omtdata_t &value, const uint32_t idx);
void set_at_internal(const subtree &subtree, const omtdata_t &value,
const uint32_t idx);
void delete_internal(subtree *const subtreep, const uint32_t idx,
omt_node *const copyn,
subtree **const rebalance_subtree);
// Internal iteration workers (definitions in omt_impl.h). The callback `f` is
// a template argument in one of two shapes: taking `const omtdata_t &`
// (declared const here where no marking is involved) or taking `omtdata_t *`,
// the same shape iterate_ptr() uses.
// Functions with an `_array` suffix take no subtree argument; the others take
// the subtree to visit plus an index `idx`.
// NOTE(review): `left`/`right` presumably bound the range of indexes visited,
// and `idx` is presumably the index offset of `subtree` within the whole omt
// -- confirm in omt_impl.h.
template <typename iterate_extra_t,
int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
int iterate_internal_array(const uint32_t left, const uint32_t right,
iterate_extra_t *const iterate_extra) const;
template <typename iterate_extra_t,
int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
void iterate_ptr_internal(const uint32_t left, const uint32_t right,
const subtree &subtree, const uint32_t idx,
iterate_extra_t *const iterate_extra);
template <typename iterate_extra_t,
int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
void iterate_ptr_internal_array(const uint32_t left, const uint32_t right,
iterate_extra_t *const iterate_extra);
template <typename iterate_extra_t,
int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
int iterate_internal(const uint32_t left, const uint32_t right,
const subtree &subtree, const uint32_t idx,
iterate_extra_t *const iterate_extra) const;
// Non-const: unlike iterate_internal(), this variant is allowed to modify the
// omt.
// NOTE(review): presumably to set marks on visited nodes -- confirm.
template <typename iterate_extra_t,
int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
int iterate_and_mark_range_internal(const uint32_t left, const uint32_t right,
const subtree &subtree,
const uint32_t idx,
iterate_extra_t *const iterate_extra);
template <typename iterate_extra_t,
int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
int iterate_over_marked_internal(const subtree &subtree, const uint32_t idx,
iterate_extra_t *const iterate_extra) const;
uint32_t verify_marks_consistent_internal(const subtree &subtree,
const bool allow_marks) const;
// Workers for the public fetch(): one form without a subtree argument
// (`_array`) and one taking the subtree to search. Both write the result
// through `value`.
void fetch_internal_array(const uint32_t i, omtdataout_t *const value) const;
void fetch_internal(const subtree &subtree, const uint32_t i,
omtdataout_t *const value) const;
// Node-index based helpers and rebalance().
// NOTE(review): fill_array_with_subtree_idxs + rebuild_subtree_from_idxs look
// like the two halves of rebalance() -- confirm in omt_impl.h.
__attribute__((nonnull)) void fill_array_with_subtree_idxs(
node_idx *const array, const subtree &subtree) const;
__attribute__((nonnull)) void rebuild_subtree_from_idxs(
subtree *const subtree, const node_idx *const idxs,
const uint32_t numvalues);
__attribute__((nonnull)) void rebalance(subtree *const subtree);
// copyout overloads: the destination `out` is either `omtdata_t *` or
// `omtdata_t **`, and the source is either a node or a pointer to a stored
// value. The `omtdata_t **` forms take a non-const source.
// NOTE(review): presumably these implement the two omtdataout_t flavours
// described in the find_zero() comment (value copied out vs. pointer into the
// omt) -- confirm in omt_impl.h.
__attribute__((nonnull)) static void copyout(omtdata_t *const out,
const omt_node *const n);
__attribute__((nonnull)) static void copyout(omtdata_t **const out,
omt_node *const n);
__attribute__((nonnull)) static void copyout(
omtdata_t *const out, const omtdata_t *const stored_value_ptr);
__attribute__((nonnull)) static void copyout(
omtdata_t **const out, omtdata_t *const stored_value_ptr);
// Workers behind the public find_zero() / find() (definitions in omt_impl.h).
// They use the same heaviside-function template parameter `h` and the same
// `extra` / `value` / `idxp` parameters as the public entry points. Each of
// the zero / plus / minus variants is declared twice: an `_array` form with
// no subtree argument and a form taking the subtree to search.
// NOTE(review): `plus` / `minus` presumably correspond to find() with
// direction>0 / direction<0, and `zero` to find_zero() -- confirm.
template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
int find_internal_zero_array(const omtcmp_t &extra, omtdataout_t *const value,
uint32_t *const idxp) const;
template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
int find_internal_zero(const subtree &subtree, const omtcmp_t &extra,
omtdataout_t *const value, uint32_t *const idxp) const;
template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
int find_internal_plus_array(const omtcmp_t &extra, omtdataout_t *const value,
uint32_t *const idxp) const;
template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
int find_internal_plus(const subtree &subtree, const omtcmp_t &extra,
omtdataout_t *const value, uint32_t *const idxp) const;
template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
int find_internal_minus_array(const omtcmp_t &extra,
omtdataout_t *const value,
uint32_t *const idxp) const;
template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
int find_internal_minus(const subtree &subtree, const omtcmp_t &extra,
omtdataout_t *const value,
uint32_t *const idxp) const;
};
} // namespace toku
// include the implementation here
#include "omt_impl.h"

@ -0,0 +1,151 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
// Overview: A partitioned_counter provides a counter that can be incremented
// and the running sum can be read at any time.
// We assume that increments are frequent, whereas reading is infrequent.
// Implementation hint: Use thread-local storage so each thread increments its
// own data. The increment does not require a lock or atomic operation.
// Reading the data can be performed by iterating over the thread-local
// versions, summing them up. The data structure also includes a sum for all
// the threads that have died. Use a pthread_key to create the thread-local
// versions. When a thread finishes, the system calls pthread_key destructor
// which can add that thread's copy into the sum_of_dead counter.
// Rationale: For statistics such as are found in engine status, we need a
// counter that requires no cache misses to increment. We've seen significant
// performance speedups by removing certain counters. Rather than removing
// those statistics, we would like to just make the counter fast. We generally
// increment the counters frequently, and want to fetch the values
// infrequently. The counters are monotonic. The counters can be split into
// many counters, which can be summed up at the end. We don't care if we get
// slightly out-of-date counter sums when we read the counter. We don't care
// if there is a race between reading a counter
// variable and incrementing it.
// See tests/test_partitioned_counter.c for some performance measurements.
// Operations:
//   create_partitioned_counter     Create a counter initialized to zero.
//   destroy_partitioned_counter    Destroy it.
//   increment_partitioned_counter  Increment it. This is the frequent
//                                  operation.
//   read_partitioned_counter       Get the current value. This is
//                                  infrequent.
// See partitioned_counter.cc for the abstraction function and representation
// invariant.
//
// The google style guide says to avoid using constructors, and it appears that
// constructors may have broken all the tests, because they called
// pthread_key_create before the key was actually created. So the google style
// guide may have some wisdom there...
//
// This version does not use constructors, essentially reverting to the google
// C++ style guide.
//
// The old C interface. This required a bunch of explicit
// __attribute__((__destructor__)) functions to remember to destroy counters at
// the end.
// C-linkage interface to partitioned counters.
// NOTE(review): uint64_t is used below, but nothing in this header includes
// <stdint.h>; it currently relies on the including file having done so.
#if defined(__cplusplus)
extern "C" {
#endif

// Handle to a counter. struct partitioned_counter is only forward-declared
// here, so the handle is opaque to users of this header.
typedef struct partitioned_counter *PARTITIONED_COUNTER;

// Effect: Create a counter, initialized to zero.
PARTITIONED_COUNTER create_partitioned_counter(void);

// Effect: Destroy the counter. No operations on that counter are permitted
// after this.
void destroy_partitioned_counter(PARTITIONED_COUNTER);

// Effect: Increment the counter by amount.
// Requires: No overflows. This is a 64-bit unsigned counter.
void increment_partitioned_counter(PARTITIONED_COUNTER, uint64_t amount);

// Effect: Return the current value of the counter.
uint64_t read_partitioned_counter(PARTITIONED_COUNTER)
    __attribute__((__visibility__("default")));

// Effect: Initialize any partitioned counters data structures that must be set
// up before any partitioned counters run.
void partitioned_counters_init(void);

// Effect: Destroy any partitioned counters data structures.
void partitioned_counters_destroy(void);

#if defined(__cplusplus)
}  // extern "C" (no trailing ';': a linkage block is not a declaration
   // that needs one)
#endif
#if 0
// Compiled out: a C++ class form of the counter, kept for reference only.
// The class name PARTITIONED_COUNTER is the same identifier as the typedef in
// the C interface above, so this block could not be enabled as written
// alongside that typedef.
#include <pthread.h>
#include "fttypes.h"
// Used inside the PARTITIONED_COUNTER.
struct linked_list_head {
struct linked_list_element *first;
};
class PARTITIONED_COUNTER {
public:
PARTITIONED_COUNTER(void);
// Effect: Construct a counter, initialized to zero.
~PARTITIONED_COUNTER(void);
// Effect: Destruct the counter.
void increment(uint64_t amount);
// Effect: Increment the counter by amount. This is a 64-bit unsigned counter, and if you overflow it, you will get overflowed results (that is mod 2^64).
// Requires: Don't use this from a static constructor or destructor.
uint64_t read(void);
// Effect: Read the sum.
// Requires: Don't use this from a static constructor or destructor.
private:
uint64_t _sum_of_dead; // The sum of all thread-local counts from threads that have terminated.
pthread_key_t _key; // The pthread_key which gives us the hook to construct and destruct thread-local storage.
struct linked_list_head _ll_counter_head; // A linked list of all the thread-local information for this counter.
// This function is used to destroy the thread-local part of the state when a thread terminates.
// But it's not the destructor for the local part of the counter, it's a destructor on a "dummy" key just so that we get a notification when a thread ends.
friend void destroy_thread_local_part_of_partitioned_counters (void *);
};
#endif

@ -0,0 +1,62 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident \
"Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include "partitioned_counter.h"
// PORT2: #include <util/constexpr.h>
// TOKUFT_STATUS_INIT(array, k, c, t, l, inc)
//
// Fills in entry `k` of `array.status`:
//   keyname    <- the spelling of `k`, stringified
//   columnname <- the spelling of `c`, stringified
//   type       <- t
//   legend     <- l
//   include    <- inc, cast to toku_engine_status_include_type
// When t == STATUS_PARCOUNT it additionally calls create_partitioned_counter()
// and stores the result in value.parcount.
//
// Compile-time checks: `c` must not be spelled NULL or 0 (use nullptr), and
// it may be spelled nullptr only when inc == TOKU_ENGINE_STATUS.
//
// Macro arguments that are used as expressions are parenthesized, so an
// argument such as `*ptr` for `array` expands to `(*ptr).status[k]` rather
// than `*ptr.status[k]`. `array`, `k` and `t` are still expanded more than
// once, so they must be free of side effects.
//
// NOTE(review): constexpr_static_assert and strcmp are not declared by
// anything this header includes (the util/constexpr.h include above is
// commented out), so code that expands this macro must provide them.
#define TOKUFT_STATUS_INIT(array, k, c, t, l, inc)                       \
  do {                                                                   \
    (array).status[k].keyname = #k;                                      \
    (array).status[k].columnname = #c;                                   \
    (array).status[k].type = (t);                                        \
    (array).status[k].legend = (l);                                      \
    constexpr_static_assert(                                             \
        strcmp(#c, "NULL") && strcmp(#c, "0"),                           \
        "Use nullptr for no column name instead of NULL, 0, etc...");    \
    constexpr_static_assert(                                             \
        (inc) == TOKU_ENGINE_STATUS || strcmp(#c, "nullptr"),            \
        "Missing column name.");                                         \
    (array).status[k].include =                                          \
        static_cast<toku_engine_status_include_type>(inc);               \
    if ((t) == STATUS_PARCOUNT) {                                        \
      (array).status[k].value.parcount = create_partitioned_counter();   \
    }                                                                    \
  } while (0)
Loading…
Cancel
Save