Miroslav Purkrabek committed
Commit a249588 · 1 Parent(s): 4b8d5c5
This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. CITATION.cff +25 -0
  2. LICENSE +674 -0
  3. app.py +262 -0
  4. configs/README.md +30 -0
  5. configs/bmp_D3.yaml +37 -0
  6. configs/bmp_J1.yaml +39 -0
  7. demo/bmp_demo.py +250 -0
  8. demo/demo_utils.py +705 -0
  9. demo/mm_utils.py +106 -0
  10. demo/posevis_lite.py +507 -0
  11. demo/sam2_utils.py +714 -0
  12. mmpose/__init__.py +27 -0
  13. mmpose/apis/__init__.py +16 -0
  14. mmpose/apis/inference.py +280 -0
  15. mmpose/apis/inference_3d.py +360 -0
  16. mmpose/apis/inference_tracking.py +103 -0
  17. mmpose/apis/inferencers/__init__.py +11 -0
  18. mmpose/apis/inferencers/base_mmpose_inferencer.py +691 -0
  19. mmpose/apis/inferencers/hand3d_inferencer.py +344 -0
  20. mmpose/apis/inferencers/mmpose_inferencer.py +250 -0
  21. mmpose/apis/inferencers/pose2d_inferencer.py +262 -0
  22. mmpose/apis/inferencers/pose3d_inferencer.py +457 -0
  23. mmpose/apis/inferencers/utils/__init__.py +5 -0
  24. mmpose/apis/inferencers/utils/default_det_models.py +36 -0
  25. mmpose/apis/inferencers/utils/get_model_alias.py +37 -0
  26. mmpose/apis/visualization.py +132 -0
  27. mmpose/codecs/__init__.py +25 -0
  28. mmpose/codecs/annotation_processors.py +100 -0
  29. mmpose/codecs/associative_embedding.py +522 -0
  30. mmpose/codecs/base.py +81 -0
  31. mmpose/codecs/decoupled_heatmap.py +274 -0
  32. mmpose/codecs/edpose_label.py +153 -0
  33. mmpose/codecs/hand_3d_heatmap.py +202 -0
  34. mmpose/codecs/image_pose_lifting.py +280 -0
  35. mmpose/codecs/integral_regression_label.py +121 -0
  36. mmpose/codecs/megvii_heatmap.py +147 -0
  37. mmpose/codecs/motionbert_label.py +240 -0
  38. mmpose/codecs/msra_heatmap.py +153 -0
  39. mmpose/codecs/onehot_heatmap.py +263 -0
  40. mmpose/codecs/regression_label.py +108 -0
  41. mmpose/codecs/simcc_label.py +311 -0
  42. mmpose/codecs/spr.py +306 -0
  43. mmpose/codecs/udp_heatmap.py +263 -0
  44. mmpose/codecs/utils/__init__.py +32 -0
  45. mmpose/codecs/utils/camera_image_projection.py +102 -0
  46. mmpose/codecs/utils/gaussian_heatmap.py +433 -0
  47. mmpose/codecs/utils/instance_property.py +111 -0
  48. mmpose/codecs/utils/offset_heatmap.py +143 -0
  49. mmpose/codecs/utils/oks_map.py +97 -0
  50. mmpose/codecs/utils/post_processing.py +530 -0
CITATION.cff ADDED
@@ -0,0 +1,25 @@
+ # CITATION.cff file for Detection, Pose Estimation and Segmentation for Multiple Bodies: Closing the Virtuous Circle
+ # This file provides metadata for the software and its preferred citation format.
+ cff-version: 1.2.0
+ message: "If you use this software, please cite it as below."
+ authors:
+   - family-names: Purkrabek
+     given-names: Miroslav
+   - family-names: Matas
+     given-names: Jiri
+ title: "Detection, Pose Estimation and Segmentation for Multiple Bodies: Closing the Virtuous Circle"
+ version: 1.0.0
+ date-released: 2025-06-20
+ preferred-citation:
+   type: conference-paper
+   authors:
+     - family-names: Purkrabek
+       given-names: Miroslav
+     - family-names: Matas
+       given-names: Jiri
+   collection-title: "Proceedings of the IEEE/CVF International Conference on Computer Vision"
+   month: 10
+   start: 1 # First page number
+   end: 8 # Last page number
+   title: "Detection, Pose Estimation and Segmentation for Multiple Bodies: Closing the Virtuous Circle"
+   year: 2025
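Since CITATION.cff is plain YAML, the citation metadata above can also be read programmatically. A minimal sketch, assuming PyYAML is installed and the file sits in the repository root:

```python
# Minimal sketch: read the citation metadata from CITATION.cff with PyYAML.
# Assumes the file lives in the repository root; adjust the path otherwise.
import yaml

with open("CITATION.cff", "r") as f:
    cff = yaml.safe_load(f)

authors = ", ".join(f"{a['given-names']} {a['family-names']}" for a in cff["authors"])
print(f"{authors}: {cff['title']} (v{cff['version']}, released {cff['date-released']})")
```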
LICENSE ADDED
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ [Lines 8-674: the remainder of the standard, unmodified GNU GPL v3 text (Preamble, Terms and Conditions 0-17, and "How to Apply These Terms to Your New Programs"), as published at https://www.gnu.org/licenses/gpl-3.0.txt]
app.py ADDED
@@ -0,0 +1,262 @@
+ import gradio as gr
+ import spaces
+
+ from pathlib import Path
+
+ import numpy as np
+ import yaml
+ from demo.demo_utils import DotDict, concat_instances, filter_instances, pose_nms, visualize_demo
+ from demo.mm_utils import run_MMDetector, run_MMPose
+ from mmdet.apis import init_detector
+ from demo.sam2_utils import prepare_model as prepare_sam2_model
+ from demo.sam2_utils import process_image_with_SAM
+
+ from mmpose.apis import init_model as init_pose_estimator
+ from mmpose.utils import adapt_mmdet_pipeline
+
+ # Default thresholds
+ DEFAULT_CAT_ID: int = 0
+
+ DEFAULT_BBOX_THR: float = 0.3
+ DEFAULT_NMS_THR: float = 0.3
+ DEFAULT_KPT_THR: float = 0.3
+
+ # Global model variables
+ det_model = None
+ pose_model = None
+ sam2_model = None
+
+ def _parse_yaml_config(yaml_path: Path) -> DotDict:
+     """
+     Load BMP configuration from a YAML file.
+
+     Args:
+         yaml_path (Path): Path to YAML config.
+     Returns:
+         DotDict: Nested config dictionary.
+     """
+     with open(yaml_path, "r") as f:
+         cfg = yaml.safe_load(f)
+     return DotDict(cfg)
+
+ def load_models(bmp_config):
+     device = 'cuda:0'
+
+     global det_model, pose_model, sam2_model
+
+     # build detector
+     det_model = init_detector(bmp_config.detector.det_config, bmp_config.detector.det_checkpoint, device='cpu')  # Detect on CPU because of installation issues on HF
+     det_model.cfg = adapt_mmdet_pipeline(det_model.cfg)
+
+     # build pose estimator
+     pose_model = init_pose_estimator(
+         bmp_config.pose_estimator.pose_config,
+         bmp_config.pose_estimator.pose_checkpoint,
+         device=device,
+         cfg_options=dict(model=dict(test_cfg=dict(output_heatmaps=False))),
+     )
+
+     sam2_model = prepare_sam2_model(
+         model_cfg=bmp_config.sam2.sam2_config,
+         model_checkpoint=bmp_config.sam2.sam2_checkpoint,
+     )
+
+     return det_model, pose_model, sam2_model
+
+ @spaces.GPU(duration=60)
+ def process_image_with_BMP(
+     img: np.ndarray
+ ) -> tuple[np.ndarray, np.ndarray]:
+     """
+     Run the full BMP pipeline on a single image: detection, pose, SAM mask refinement, and visualization.
+
+     Args:
+         img (np.ndarray): Input image as an RGB array.
+     Returns:
+         tuple[np.ndarray, np.ndarray]: RGB visualizations of the first-iteration
+         RTMDet + MaskPose result and of the final BBoxMaskPose result.
+     """
+     bmp_config = _parse_yaml_config(Path("configs/bmp_D3.yaml"))
+     load_models(bmp_config)
+
+     # img: RGB -> BGR
+     img = img[..., ::-1]
+
+     img_for_detection = img.copy()
+     rtmdet_result = None
+     all_detections = None
+     for iteration in range(bmp_config.num_bmp_iters):
+
+         # Step 1: Detection
+         det_instances = run_MMDetector(
+             det_model,
+             img_for_detection,
+             det_cat_id=DEFAULT_CAT_ID,
+             bbox_thr=DEFAULT_BBOX_THR,
+             nms_thr=DEFAULT_NMS_THR,
+         )
+         if len(det_instances.bboxes) == 0:
+             continue
+
+         # Step 2: Pose estimation
+         pose_instances = run_MMPose(
+             pose_model,
+             img.copy(),
+             detections=det_instances,
+             kpt_thr=DEFAULT_KPT_THR,
+         )
+
+         # Restrict to the first 17 COCO keypoints
+         pose_instances.keypoints = pose_instances.keypoints[:, :17, :]
+         pose_instances.keypoint_scores = pose_instances.keypoint_scores[:, :17]
+         pose_instances.keypoints = np.concatenate(
+             [pose_instances.keypoints, pose_instances.keypoint_scores[:, :, None]], axis=-1
+         )
+
+         # Step 3: Pose-NMS and SAM refinement
+         all_keypoints = (
+             pose_instances.keypoints
+             if all_detections is None
+             else np.concatenate([all_detections.keypoints, pose_instances.keypoints], axis=0)
+         )
+         all_bboxes = (
+             pose_instances.bboxes
+             if all_detections is None
+             else np.concatenate([all_detections.bboxes, pose_instances.bboxes], axis=0)
+         )
+         num_valid_kpts = np.sum(all_keypoints[:, :, 2] > bmp_config.sam2.prompting.confidence_thr, axis=1)
+         keep_indices = pose_nms(
+             DotDict({"confidence_thr": bmp_config.sam2.prompting.confidence_thr, "oks_thr": bmp_config.oks_nms_thr}),
+             image_kpts=all_keypoints,
+             image_bboxes=all_bboxes,
+             num_valid_kpts=num_valid_kpts,
+         )
+         keep_indices = sorted(keep_indices)  # Sort by original index
+         num_old_detections = 0 if all_detections is None else len(all_detections.bboxes)
+         keep_new_indices = [i - num_old_detections for i in keep_indices if i >= num_old_detections]
+         keep_old_indices = [i for i in keep_indices if i < num_old_detections]
+         if len(keep_new_indices) == 0:
+             continue
+         # Filter new detections and compute scores
+         new_dets = filter_instances(pose_instances, keep_new_indices)
+         new_dets.scores = pose_instances.keypoint_scores[keep_new_indices].mean(axis=-1)
+         old_dets = None
+         if len(keep_old_indices) > 0:
+             old_dets = filter_instances(all_detections, keep_old_indices)
+
+         new_detections = process_image_with_SAM(
+             DotDict(bmp_config.sam2.prompting),
+             img.copy(),
+             sam2_model,
+             new_dets,
+             old_dets if old_dets is not None else None,
+         )
+
+         # Merge detections
+         if all_detections is None:
+             all_detections = new_detections
+         else:
+             all_detections = concat_instances(all_detections, new_dets)
+
+         # Step 4: Visualization
+         img_for_detection, rtmdet_r, _ = visualize_demo(
+             img.copy(),
+             all_detections,
+         )
+
+         if iteration == 0:
+             rtmdet_result = rtmdet_r
+
+     _, _, bmp_result = visualize_demo(
+         img.copy(),
+         all_detections,
+     )
+
+     # img: BGR -> RGB
+     rtmdet_result = rtmdet_result[..., ::-1]
+     bmp_result = bmp_result[..., ::-1]
+
+     return rtmdet_result, bmp_result
+
+
+ with gr.Blocks() as app:
+     gr.Markdown("# BBoxMaskPose Image Demo")
+     gr.Markdown(
+         "Official demo for the paper **Detection, Pose Estimation and Segmentation for Multiple Bodies: Closing the Virtuous Circle** [ICCV 2025]."
+     )
+     gr.Markdown(
+         "For details, see the [project website](https://mirapurkrabek.github.io/BBox-Mask-Pose/) or the [arXiv paper](https://arxiv.org/abs/2412.01562). "
+         "The demo showcases the capabilities of the BBoxMaskPose framework on any image. "
+         "If you want to play around with parameters, use the [GitHub demo](https://github.com/MiraPurkrabek/BBoxMaskPose). "
+         "Please note that due to HuggingFace restrictions, the demo runs much slower than the GitHub implementation."
+     )
+
+     with gr.Row():
+         with gr.Column():
+             original_image_input = gr.Image(type="numpy", label="Original Image")
+             submit_button = gr.Button("Run Inference")
+
+         with gr.Column():
+             output_standard = gr.Image(type="numpy", label="RTMDet-L + MaskPose-B")
+
+         with gr.Column():
+             output_sahi_sliced = gr.Image(type="numpy", label="BBoxMaskPose")
+
+     gr.Examples(
+         label="OCHuman examples",
+         examples=[
+             ["examples/004806.jpg"],
+             ["examples/005056.jpg"],
+             ["examples/004981.jpg"],
+             ["examples/004655.jpg"],
+             ["examples/004684.jpg"],
+             ["examples/004974.jpg"],
+             ["examples/004983.jpg"],
+             ["examples/005017.jpg"],
+             ["examples/004849.jpg"],
+         ],
+         inputs=[
+             original_image_input,
+         ],
+         outputs=[output_standard, output_sahi_sliced],
+         fn=process_image_with_BMP,
+         cache_examples=True,
+     )
+     gr.Examples(
+         label="In-the-wild examples",
+         examples=[
+             ["examples/prochazka_MMA.jpg"],
+             ["examples/riner_judo.jpg"],
+             ["examples/tackle3.jpg"],
+             ["examples/tackle1.jpg"],
+             ["examples/tackle2.jpg"],
+             ["examples/tackle5.jpg"],
+             ["examples/floorball_SKV_3.jpg"],
+             ["examples/santa_o_crop.jpg"],
+             ["examples/floorball_SKV_2.jpg"],
+         ],
+         inputs=[
+             original_image_input,
+         ],
+         outputs=[output_standard, output_sahi_sliced],
+         fn=process_image_with_BMP,
+         cache_examples=True,
+     )
+
+     submit_button.click(
+         fn=process_image_with_BMP,
+         inputs=[
+             original_image_input,
+         ],
+         outputs=[output_standard, output_sahi_sliced],
+     )
+
+ # Launch the demo
+ app.launch()
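The Space wires process_image_with_BMP to a Gradio button, but the function itself only needs an RGB numpy array and returns two RGB visualizations. A rough sketch of calling it directly, assuming the repository root is the working directory, Pillow is available, and the app.launch() call at the bottom of app.py is guarded or commented out so the import does not start the UI:

```python
# Rough sketch: run the BMP pipeline from app.py without the Gradio UI.
# Assumes app.launch() is guarded/commented out before importing app.
import numpy as np
from PIL import Image

from app import process_image_with_BMP  # hypothetical direct import; see note above

img = np.array(Image.open("examples/004806.jpg").convert("RGB"))  # RGB in
rtmdet_vis, bmp_vis = process_image_with_BMP(img)                 # RGB out

Image.fromarray(rtmdet_vis).save("rtmdet_plus_maskpose.jpg")
Image.fromarray(bmp_vis).save("bboxmaskpose.jpg")
```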
configs/README.md ADDED
@@ -0,0 +1,30 @@
+ # Configuration Files Overview
+
+ This directory contains configuration files for reproducing experiments and running inference across different components of the BBoxMaskPose project.
+
+ ## Which configs are available?
+
+ Here you can find configs setting up the hyperparameters of the whole loop.
+ These are mainly:
+ - How to prompt SAM
+ - Which models to use (detection, pose, SAM)
+ - How to chain the models
+ - ...
+
+ For easier reference, the configs have the same names as in the supplementary material of the ICCV paper.
+ So, for example, config [**bmp_D3.yaml**](bmp_D3.yaml) is the prompting experiment used in the BMP loop.
+ For details, see Tabs. 6-8 of the supplementary.
+
+ ## Where are the appropriate configs?
+
+ - **/configs** (this folder)
+   - Hyperparameter configurations for the BMP loop experiments. Use these files to reproduce training and evaluation settings.
+
+ - **/mmpose/configs**
+   - Configuration files for MMPose, following the same format and structure as MMPose v1.3.1. Supports models, datasets, and training pipelines.
+
+ - **/sam2/configs**
+   - Configuration files for SAM2, matching the format and directory layout of the original SAM v2.1 repository. Use these for prompt-driven segmentation and related tasks.
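The configs in this folder are consumed as nested dictionaries with attribute access; both app.py and demo/bmp_demo.py load them with yaml.safe_load wrapped in DotDict. A small sketch of the same pattern, assuming the repository root as the working directory:

```python
# Small sketch: load a BMP loop config the same way app.py / demo/bmp_demo.py do.
from pathlib import Path

import yaml
from demo.demo_utils import DotDict  # attribute-style access to the nested dict

cfg = DotDict(yaml.safe_load(Path("configs/bmp_D3.yaml").read_text()))

print(cfg.num_bmp_iters)                     # 2
print(cfg.sam2.prompting.num_pos_keypoints)  # 6
```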
configs/bmp_D3.yaml ADDED
@@ -0,0 +1,37 @@
+ # BBoxMaskPose Hyperparameters from Experiment D3.
+ # For details, see the paper: https://arxiv.org/abs/2412.01562, Tab. 8 in the supplementary.
+
+ # This configuration is good for the BMP loop and was used for most of the experiments.
+ detector:
+   det_config: 'mmpose/configs/mmdet/rtmdet/rtmdet-ins_l_8xb32-300e_coco.py'
+   det_checkpoint: 'https://huggingface.co/vrg-prague/BBoxMaskPose/resolve/main/rtmdet-ins-l-mask.pth'
+
+   # Detectors D and D' could be different.
+   det_prime_config: null
+   det_prime_checkpoint: null
+
+ pose_estimator:
+   pose_config: 'mmpose/configs/MaskPose/ViTb-multi_mask.py'
+   pose_checkpoint: 'https://huggingface.co/vrg-prague/BBoxMaskPose/resolve/main/MaskPose-b.pth'
+
+ sam2:
+   sam2_config: 'configs/samurai/sam2.1_hiera_b+.yaml' # Use SAMURAI as it has img_size 1024 (SAM-2.1 has 512)
+   sam2_checkpoint: 'models/SAM/sam2.1_hiera_base_plus.pt'
+   prompting:
+     batch: False
+     use_bbox: False
+     num_pos_keypoints: 6
+     num_pos_keypoints_if_crowd: 6
+     num_neg_keypoints: 0
+     confidence_thr: 0.3
+     visibility_thr: 0.3
+     selection_method: 'distance+confidence'
+     extend_bbox: False
+     pose_mask_consistency: False
+     crowd_by_max_iou: False # Determines whether the instance is in a multi-body scenario. If yes, use a different number of keypoints and NO BBOX. If not, use the bbox according to the 'use_bbox' argument.
+     crop: False
+     exclusive_masks: True
+     ignore_small_bboxes: False
+
+ num_bmp_iters: 2
+ oks_nms_thr: 0.8
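Inside the loop, the prompting block above is wrapped in a DotDict and handed straight to the SAM refinement step. The fragment below mirrors the corresponding call in app.py; img, sam2_model, new_dets, and old_dets are assumed to already exist as they do in that script's loop:

```python
# Excerpt-style sketch mirroring app.py: the D3 prompting block drives SAM refinement.
# img (BGR), sam2_model, new_dets, and old_dets are assumed to be in scope as in app.py.
from demo.demo_utils import DotDict
from demo.sam2_utils import process_image_with_SAM

prompt_cfg = DotDict(bmp_config.sam2.prompting)  # use_bbox, num_pos_keypoints, ...
new_detections = process_image_with_SAM(
    prompt_cfg,
    img.copy(),   # BGR image, as prepared in app.py
    sam2_model,
    new_dets,     # detections kept by pose-NMS in this iteration
    old_dets,     # previously accepted detections, or None
)
```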
configs/bmp_J1.yaml ADDED
@@ -0,0 +1,39 @@
1
+ # BBoxMaskPose Hyperparameters from Experiment J1.
2
+ # For details, see the paper: https://arxiv.org/abs/2412.01562, Tab 8. in the supplementary.
3
+
4
+ # This configuration is good for getting extra AP points when the estimates are already good.
5
+ # It is not recommended for the whole loop (as done here -- this is for the demo) but rather for
6
+ # the det-pose-sam-pose studied in Tab. 4.
7
+ detector:
8
+ det_config: 'mmpose/configs/mmdet/rtmdet/rtmdet-ins_l_8xb32-300e_coco.py'
9
+ det_checkpoint: 'https://huggingface.co/vrg-prague/BBoxMaskPose/resolve/main/rtmdet-ins-l-mask.pth'
10
+
11
+ # Detectors D and D' could be different.
12
+ det_prime_config: null
13
+ det_prime_checkpoint: null
14
+
15
+ pose_estimator:
16
+ pose_config: 'mmpose/configs/MaskPose/ViTb-multi_mask.py'
17
+ pose_checkpoint: 'https://huggingface.co/vrg-prague/BBoxMaskPose/resolve/main/MaskPose-b.pth'
18
+
19
+ sam2:
20
+ sam2_config: 'configs/samurai/sam2.1_hiera_b+.yaml' # Use SAMURAI as it has img_size 1024 (SAM-2.1 has 512)
21
+ sam2_checkpoint: 'models/SAM/sam2.1_hiera_base_plus.pt'
22
+ prompting:
23
+ batch: True
24
+ use_bbox: False
25
+ num_pos_keypoints: 4
26
+ num_pos_keypoints_if_crowd: 6
27
+ num_neg_keypoints: 0
28
+ confidence_thr: 0.5
29
+ visibility_thr: 0.5
30
+ selection_method: 'distance+confidence'
31
+ extend_bbox: False
32
+ pose_mask_consistency: False
33
+ crowd_by_max_iou: 0.5 # Determines whether the instance is in a multi-body scenario. If yes, use a different number of keypoints and NO BBOX. If no, use the bbox according to the 'use_bbox' argument.
34
+ crop: False
35
+ exclusive_masks: True
36
+ ignore_small_bboxes: False
37
+
38
+ num_bmp_iters: 2
39
+ oks_nms_thr: 0.8
demo/bmp_demo.py ADDED
@@ -0,0 +1,250 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ """
3
+ BMP Demo script: sequentially runs detection, pose estimation, SAM-based mask refinement, and visualization.
4
+ Usage:
5
+ python bmp_demo.py <config.yaml> <input_image> [--output-root <dir>]
6
+ """
7
+
8
+ import os
9
+ import shutil
10
+ from argparse import ArgumentParser, Namespace
11
+ from pathlib import Path
12
+
13
+ import mmcv
14
+ import mmengine
15
+ import numpy as np
16
+ import yaml
17
+ from demo_utils import DotDict, concat_instances, create_GIF, filter_instances, pose_nms, visualize_itteration
18
+ from mm_utils import run_MMDetector, run_MMPose
19
+ from mmdet.apis import init_detector
20
+ from mmengine.logging import print_log
21
+ from mmengine.structures import InstanceData
22
+ from sam2_utils import prepare_model as prepare_sam2_model
23
+ from sam2_utils import process_image_with_SAM
24
+
25
+ from mmpose.apis import init_model as init_pose_estimator
26
+ from mmpose.utils import adapt_mmdet_pipeline
27
+
28
+ # Default thresholds
29
+ DEFAULT_DET_CAT_ID: int = 0 # "person"
30
+ DEFAULT_BBOX_THR: float = 0.3
31
+ DEFAULT_NMS_THR: float = 0.3
32
+ DEFAULT_KPT_THR: float = 0.3
33
+
34
+
35
+ def parse_args() -> Namespace:
36
+ """
37
+ Parse command-line arguments for BMP demo.
38
+
39
+ Returns:
40
+ Namespace: Contains bmp_config (Path), input (Path), output_root (Path), device (str).
41
+ """
42
+ parser = ArgumentParser(description="BBoxMaskPose demo")
43
+ parser.add_argument("bmp_config", type=Path, help="Path to BMP YAML config file")
44
+ parser.add_argument("input", type=Path, help="Input image file")
45
+ parser.add_argument("--output-root", type=Path, default=None, help="Directory to save outputs (default: ./outputs)")
46
+ parser.add_argument("--device", type=str, default="cuda:0", help="Device for inference (e.g., cuda:0 or cpu)")
47
+ parser.add_argument("--create-gif", action="store_true", default=False, help="Create GIF of all BMP iterations")
48
+ args = parser.parse_args()
49
+ if args.output_root is None:
50
+ args.output_root = Path(__file__).parent / "outputs"
51
+ return args
52
+
53
+
54
+ def parse_yaml_config(yaml_path: Path) -> DotDict:
55
+ """
56
+ Load BMP configuration from a YAML file.
57
+
58
+ Args:
59
+ yaml_path (Path): Path to YAML config.
60
+ Returns:
61
+ DotDict: Nested config dictionary.
62
+ """
63
+ with open(yaml_path, "r") as f:
64
+ cfg = yaml.safe_load(f)
65
+ return DotDict(cfg)
66
+
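+ # Usage sketch (illustrative path): cfg = parse_yaml_config(Path("configs/bmp_D3.yaml"))
+ # exposes nested settings as attributes, e.g. cfg.num_bmp_iters or cfg.sam2.prompting.num_pos_keypoints.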
67
+
68
+ def process_one_image(
69
+ args: Namespace,
70
+ bmp_config: DotDict,
71
+ img_path: Path,
72
+ detector: object,
73
+ detector_prime: object,
74
+ pose_estimator: object,
75
+ sam2_model: object,
76
+ ) -> InstanceData:
77
+ """
78
+ Run the full BMP pipeline on a single image: detection, pose, SAM mask refinement, and visualization.
79
+
80
+ Args:
81
+ args (Namespace): Parsed CLI arguments.
82
+ bmp_config (DotDict): Configuration parameters.
83
+ img_path (Path): Path to the input image.
84
+ detector: Primary MMDetection model.
85
+ detector_prime: Secondary MMDetection model for iterations.
86
+ pose_estimator: MMPose model for keypoint estimation.
87
+ sam2_model: SAM model for mask refinement.
88
+ Returns:
89
+ InstanceData: Final merged detections and refined masks.
90
+ """
91
+ # Load image
92
+ img = mmcv.imread(str(img_path), channel_order="bgr")
93
+ if img is None:
94
+ raise ValueError("Failed to read image from {}.".format(img_path))
95
+
96
+ # Prepare output directory
97
+ output_dir = os.path.join(args.output_root, img_path.stem)
98
+ shutil.rmtree(str(output_dir), ignore_errors=True)
99
+ mmengine.mkdir_or_exist(str(output_dir))
100
+
101
+ img_for_detection = img.copy()
102
+ all_detections = None
103
+ for iteration in range(bmp_config.num_bmp_iters):
104
+ print_log("BMP Iteration {}/{} started".format(iteration + 1, bmp_config.num_bmp_iters), logger="current")
105
+
106
+ # Step 1: Detection
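+ # The first pass uses detector D on the original image; later BMP iterations run D' on the image
+ # produced by the previous iteration, in which already-accepted instances have been masked out.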
107
+ det_instances = run_MMDetector(
108
+ detector if iteration == 0 else detector_prime,
109
+ img_for_detection,
110
+ det_cat_id=DEFAULT_DET_CAT_ID,
111
+ bbox_thr=DEFAULT_BBOX_THR,
112
+ nms_thr=DEFAULT_NMS_THR,
113
+ )
114
+ print_log("Detected {} instances".format(len(det_instances.bboxes)), logger="current")
115
+ if len(det_instances.bboxes) == 0:
116
+ print_log("No detections found, skipping.", logger="current")
117
+ continue
118
+
119
+ # Step 2: Pose estimation
120
+ pose_instances = run_MMPose(
121
+ pose_estimator,
122
+ img.copy(),
123
+ detections=det_instances,
124
+ kpt_thr=DEFAULT_KPT_THR,
125
+ )
126
+ # Restrict to first 17 COCO keypoints
127
+ pose_instances.keypoints = pose_instances.keypoints[:, :17, :]
128
+ pose_instances.keypoint_scores = pose_instances.keypoint_scores[:, :17]
129
+ pose_instances.keypoints = np.concatenate(
130
+ [pose_instances.keypoints, pose_instances.keypoint_scores[:, :, None]], axis=-1
131
+ )
132
+
133
+ # Step 3: Pose-NMS and SAM refinement
134
+ all_keypoints = (
135
+ pose_instances.keypoints
136
+ if all_detections is None
137
+ else np.concatenate([all_detections.keypoints, pose_instances.keypoints], axis=0)
138
+ )
139
+ all_bboxes = (
140
+ pose_instances.bboxes
141
+ if all_detections is None
142
+ else np.concatenate([all_detections.bboxes, pose_instances.bboxes], axis=0)
143
+ )
144
+ num_valid_kpts = np.sum(all_keypoints[:, :, 2] > bmp_config.sam2.prompting.confidence_thr, axis=1)
145
+ keep_indices = pose_nms(
146
+ DotDict({"confidence_thr": bmp_config.sam2.prompting.confidence_thr, "oks_thr": bmp_config.oks_nms_thr}),
147
+ image_kpts=all_keypoints,
148
+ image_bboxes=all_bboxes,
149
+ num_valid_kpts=num_valid_kpts,
150
+ )
151
+ keep_indices = sorted(keep_indices) # Sort by original index
152
+ num_old_detections = 0 if all_detections is None else len(all_detections.bboxes)
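+ # Indices below num_old_detections refer to instances kept from previous iterations; the remaining
+ # indices are shifted back so they index into this iteration's pose_instances.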
153
+ keep_new_indices = [i - num_old_detections for i in keep_indices if i >= num_old_detections]
154
+ keep_old_indices = [i for i in keep_indices if i < num_old_detections]
155
+ if len(keep_new_indices) == 0:
156
+ print_log("No new instances passed pose NMS, skipping SAM refinement.", logger="current")
157
+ continue
158
+ # filter new detections and compute scores
159
+ new_dets = filter_instances(pose_instances, keep_new_indices)
160
+ new_dets.scores = pose_instances.keypoint_scores[keep_new_indices].mean(axis=-1)
161
+ old_dets = None
162
+ if len(keep_old_indices) > 0:
163
+ old_dets = filter_instances(all_detections, keep_old_indices)
164
+ print_log(
165
+ "Pose NMS reduced instances to {:d} ({:d}+{:d}) instances".format(
166
+ len(new_dets.bboxes) + num_old_detections, num_old_detections, len(new_dets.bboxes)
167
+ ),
168
+ logger="current",
169
+ )
170
+
171
+ new_detections = process_image_with_SAM(
172
+ DotDict(bmp_config.sam2.prompting),
173
+ img.copy(),
174
+ sam2_model,
175
+ new_dets,
176
+ old_dets if old_dets is not None else None,
177
+ )
178
+
179
+ # Merge detections
180
+ if all_detections is None:
181
+ all_detections = new_detections
182
+ else:
183
+ all_detections = concat_instances(all_detections, new_detections)
184
+
185
+ # Step 4: Visualization
186
+ img_for_detection = visualize_itteration(
187
+ img.copy(),
188
+ all_detections,
189
+ iteration_idx=iteration,
190
+ output_root=str(output_dir),
191
+ img_name=img_path.stem,
192
+ )
193
+ print_log("Iteration {} completed".format(iteration + 1), logger="current")
194
+
195
+ # Create GIF of iterations if requested
196
+ if args.create_gif:
197
+ image_file = os.path.join(output_dir, "{:s}.jpg".format(img_path.stem))
198
+ create_GIF(
199
+ img_path=str(image_file),
200
+ output_root=str(output_dir),
201
+ bmp_x=bmp_config.num_bmp_iters,
202
+ )
203
+ return all_detections
204
+
205
+
206
+ def main() -> None:
207
+ """
208
+ Entry point for the BMP demo: loads models and processes one image.
209
+ """
210
+ args = parse_args()
211
+ bmp_config = parse_yaml_config(args.bmp_config)
212
+
213
+ # Ensure output root exists
214
+ mmengine.mkdir_or_exist(str(args.output_root))
215
+
216
+ # build detectors
217
+ detector = init_detector(bmp_config.detector.det_config, bmp_config.detector.det_checkpoint, device=args.device)
218
+ detector.cfg = adapt_mmdet_pipeline(detector.cfg)
219
+ if (
220
+ bmp_config.detector.det_config == bmp_config.detector.det_prime_config
221
+ and bmp_config.detector.det_checkpoint == bmp_config.detector.det_prime_checkpoint
222
+ ) or (bmp_config.detector.det_prime_config is None or bmp_config.detector.det_prime_checkpoint is None):
223
+ print_log("Using the same detector as D and D'", logger="current")
224
+ detector_prime = detector
225
+ else:
226
+ detector_prime = init_detector(
227
+ bmp_config.detector.det_prime_config, bmp_config.detector.det_prime_checkpoint, device=args.device
228
+ )
229
+ detector_prime.cfg = adapt_mmdet_pipeline(detector_prime.cfg)
230
+ print_log("Using a different detector for D'", logger="current")
231
+
232
+ # build pose estimator
233
+ pose_estimator = init_pose_estimator(
234
+ bmp_config.pose_estimator.pose_config,
235
+ bmp_config.pose_estimator.pose_checkpoint,
236
+ device=args.device,
237
+ cfg_options=dict(model=dict(test_cfg=dict(output_heatmaps=False))),
238
+ )
239
+
240
+ sam2 = prepare_sam2_model(
241
+ model_cfg=bmp_config.sam2.sam2_config,
242
+ model_checkpoint=bmp_config.sam2.sam2_checkpoint,
243
+ )
244
+
245
+ # Run inference on one image
246
+ _ = process_one_image(args, bmp_config, args.input, detector, detector_prime, pose_estimator, sam2)
247
+
248
+
249
+ if __name__ == "__main__":
250
+ main()
demo/demo_utils.py ADDED
@@ -0,0 +1,705 @@
1
+ """
2
+ Utilities for the BMP demo:
3
+ - Visualization of detections, masks, and poses
4
+ - Mask and bounding-box processing
5
+ - Pose non-maximum suppression (NMS)
6
+ - Animated GIF creation of demo iterations
7
+ """
8
+
9
+ import logging
10
+ import os
11
+ import shutil
12
+ import subprocess
13
+ from pathlib import Path
14
+ from typing import Any, Dict, List, Optional, Tuple
15
+
16
+ import cv2
17
+ import numpy as np
18
+ from mmengine.logging import print_log
19
+ from mmengine.structures import InstanceData
20
+ from pycocotools import mask as Mask
21
+ from sam2.distinctipy import get_colors
22
+ from tqdm import tqdm
23
+
24
+ ### Visualization hyperparameters
25
+ MIN_CONTOUR_AREA: int = 50
26
+ BBOX_WEIGHT: float = 0.9
27
+ MASK_WEIGHT: float = 0.6
28
+ BACK_MASK_WEIGHT: float = 0.6
29
+ POSE_WEIGHT: float = 0.8
30
+
31
+
32
+ """
33
+ posevis is our custom visualization library for pose estimation. For compatibility, we also provide a lite version that has fewer features but still reproduces the visualizations from the paper.
34
+ """
35
+ try:
36
+ from posevis import pose_visualization
37
+ except ImportError:
38
+ from posevis_lite import pose_visualization  # absolute import, as the demo scripts load demo_utils as a top-level module
39
+
40
+
41
+ class DotDict(dict):
42
+ """Dictionary with attribute access and nested dict wrapping."""
43
+
44
+ def __getattr__(self, name: str) -> any:
45
+ if name in self:
46
+ val = self[name]
47
+ if isinstance(val, dict):
48
+ val = DotDict(val)
49
+ self[name] = val
50
+ return val
51
+ raise AttributeError("No attribute named {!r}".format(name))
52
+
53
+ def __setattr__(self, name: str, value: any) -> None:
54
+ self[name] = value
55
+
56
+ def __delattr__(self, name: str) -> None:
57
+ if name in self:
58
+ del self[name]
59
+ else:
60
+ raise AttributeError("No attribute named {!r}".format(name))
61
+
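+ # Example: DotDict({"sam2": {"prompting": {"batch": False}}}).sam2.prompting.batch evaluates to False;
+ # nested plain dicts are wrapped into DotDict lazily on first attribute access.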
62
+
63
+ def filter_instances(instances: InstanceData, indices):
64
+ """
65
+ Return a new InstanceData containing only the entries of 'instances' at the given indices.
66
+ """
67
+ if instances is None:
68
+ return None
69
+ data = {}
70
+ # Attributes to filter
71
+ for attr in [
72
+ "bboxes",
73
+ "bbox_scores",
74
+ "keypoints",
75
+ "keypoint_scores",
76
+ "scores",
77
+ "pred_masks",
78
+ "refined_masks",
79
+ "sam_scores",
80
+ "sam_kpts",
81
+ ]:
82
+ if hasattr(instances, attr):
83
+ arr = getattr(instances, attr)
84
+ data[attr] = arr[indices] if arr is not None else None
85
+ return InstanceData(**data)
86
+
87
+
88
+ def concat_instances(instances1: InstanceData, instances2: InstanceData):
89
+ """
90
+ Concatenate two InstanceData objects along the first axis, preserving order.
91
+ If instances1 or instances2 is None, returns the other.
92
+ """
93
+ if instances1 is None:
94
+ return instances2
95
+ if instances2 is None:
96
+ return instances1
97
+ data = {}
98
+ for attr in [
99
+ "bboxes",
100
+ "bbox_scores",
101
+ "keypoints",
102
+ "keypoint_scores",
103
+ "scores",
104
+ "pred_masks",
105
+ "refined_masks",
106
+ "sam_scores",
107
+ "sam_kpts",
108
+ ]:
109
+ arr1 = getattr(instances1, attr, None)
110
+ arr2 = getattr(instances2, attr, None)
111
+ if arr1 is None and arr2 is None:
112
+ continue
113
+ if arr1 is None:
114
+ data[attr] = arr2
115
+ elif arr2 is None:
116
+ data[attr] = arr1
117
+ else:
118
+ data[attr] = np.concatenate([arr1, arr2], axis=0)
119
+ return InstanceData(**data)
120
+
121
+
122
+ def _visualize_predictions(
123
+ img: np.ndarray,
124
+ bboxes: np.ndarray,
125
+ scores: np.ndarray,
126
+ masks: List[Optional[List[np.ndarray]]],
127
+ poses: List[Optional[np.ndarray]],
128
+ vis_type: str = "mask",
129
+ mask_is_binary: bool = False,
130
+ ) -> Tuple[np.ndarray, np.ndarray]:
131
+ """
132
+ Render bounding boxes, segmentation masks, and poses on the input image.
133
+
134
+ Args:
135
+ img (np.ndarray): BGR image of shape (H, W, 3).
136
+ bboxes (np.ndarray): Array of bounding boxes [x, y, w, h].
137
+ scores (np.ndarray): Confidence scores for each bbox.
138
+ masks (List[Optional[List[np.ndarray]]]): Polygon masks per instance.
139
+ poses (List[Optional[np.ndarray]]): Keypoint arrays per instance.
140
+ vis_type (str): Flags for visualization types separated by '+'.
141
+ mask_is_binary (bool): Whether input masks are binary arrays.
142
+
143
+ Returns:
144
+ Tuple[np.ndarray, np.ndarray]: The visualized image and color map.
145
+ """
146
+ vis_types = vis_type.split("+")
147
+
148
+ # # Filter-out small detections to make the visualization more clear
149
+ # new_bboxes = []
150
+ # new_scores = []
151
+ # new_masks = []
152
+ # new_poses = []
153
+ # size_thr = img.shape[0] * img.shape[1] * 0.01
154
+ # for bbox, score, mask, pose in zip(bboxes, scores, masks, poses):
155
+ # area = mask.sum() # Assume binary mask. OK for demo purposes
156
+ # if area > size_thr:
157
+ # new_bboxes.append(bbox)
158
+ # new_scores.append(score)
159
+ # new_masks.append(mask)
160
+ # new_poses.append(pose)
161
+ # bboxes = np.array(new_bboxes)
162
+ # scores = np.array(new_scores)
163
+ # masks = new_masks
164
+ # poses = new_poses
165
+
166
+ if mask_is_binary:
167
+ poly_masks: List[Optional[List[np.ndarray]]] = []
168
+ for binary_mask in masks:
169
+ if binary_mask is not None:
170
+ contours, _ = cv2.findContours(
171
+ (binary_mask * 255).astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
172
+ )
173
+ polys = [cnt.flatten() for cnt in contours if cv2.contourArea(cnt) >= MIN_CONTOUR_AREA]
174
+ else:
175
+ polys = None
176
+ poly_masks.append(polys)
177
+ masks = poly_masks # type: ignore
178
+
179
+ # Exclude green, gray, black, and white from the palette as they are not distinctive
180
+ colors = (np.array(get_colors(len(bboxes), exclude_colors=[(0, 1, 0), (.5, .5, .5), (0, 0, 0), (1, 1, 1)], rng=0)) * 255).astype(
181
+ int
182
+ )
183
+
184
+
185
+ if "inv-mask" in vis_types:
186
+ stencil = np.zeros_like(img)
187
+
188
+ for bbox, score, mask_poly, pose, color in zip(bboxes, scores, masks, poses, colors):
189
+ bbox = _update_bbox_by_mask(list(map(int, bbox)), mask_poly, img.shape)
190
+ color_list = color.tolist()
191
+ img_copy = img.copy()
192
+
193
+ if "bbox" in vis_types:
194
+ x, y, w, h = bbox
195
+ cv2.rectangle(img_copy, (x, y), (x + w, y + h), color_list, 2)
196
+ img = cv2.addWeighted(img, 1 - BBOX_WEIGHT, img_copy, BBOX_WEIGHT, 0)
197
+
198
+ if mask_poly is not None and "mask" in vis_types:
199
+ for seg in mask_poly:
200
+ seg_pts = np.array(seg).reshape(-1, 1, 2).astype(int)
201
+ cv2.fillPoly(img_copy, [seg_pts], color_list)
202
+ img = cv2.addWeighted(img, 1 - MASK_WEIGHT, img_copy, MASK_WEIGHT, 0)
203
+
204
+ if mask_poly is not None and "mask-out" in vis_types:
205
+ for seg in mask_poly:
206
+ seg_pts = np.array(seg).reshape(-1, 1, 2).astype(int)
207
+ cv2.fillPoly(img, [seg_pts], (0, 0, 0))
208
+
209
+ if mask_poly is not None and "inv-mask" in vis_types:
210
+ for seg in mask_poly:
211
+ seg = np.array(seg).reshape(-1, 1, 2).astype(int)
212
+ if cv2.contourArea(seg) < MIN_CONTOUR_AREA:
213
+ continue
214
+ cv2.fillPoly(stencil, [seg], (255, 255, 255))
215
+
216
+ if pose is not None and "pose" in vis_types:
217
+ vis_img = pose_visualization(
218
+ img.copy(),
219
+ pose.reshape(-1, 3),
220
+ width_multiplier=8,
221
+ differ_individuals=True,
222
+ color=color_list,
223
+ keep_image_size=True,
224
+ )
225
+ img = cv2.addWeighted(img, 1 - POSE_WEIGHT, vis_img, POSE_WEIGHT, 0)
226
+
227
+ if "inv-mask" in vis_types:
228
+ img = cv2.addWeighted(img, 1 - BACK_MASK_WEIGHT, cv2.bitwise_and(img, stencil), BACK_MASK_WEIGHT, 0)
229
+
230
+ return img, colors
231
+
232
+
233
+ def visualize_itteration(
234
+ img: np.ndarray, detections: Any, iteration_idx: int, output_root: Path, img_name: str, with_text: bool = True
235
+ ) -> Optional[np.ndarray]:
236
+ """
237
+ Generate and save visualization images for each BMP iteration.
238
+
239
+ Args:
240
+ img (np.ndarray): Original input image.
241
+ detections: InstanceData containing bboxes, scores, masks, keypoints.
242
+ iteration_idx (int): Current iteration index (0-based).
243
+ output_root (Path): Directory to save output images.
244
+ img_name (str): Base name of the image without extension.
245
+ with_text (bool): Whether to overlay text labels.
246
+
247
+ Returns:
248
+ Optional[np.ndarray]: The masked-out image if generated, else None.
249
+ """
250
+ bboxes = detections.bboxes
251
+ scores = detections.scores
252
+ pred_masks = detections.pred_masks
253
+ refined_masks = detections.refined_masks
254
+ keypoints = detections.keypoints
255
+ sam_kpts = detections.sam_kpts
256
+
257
+ masked_out = None
258
+ for vis_def in [
259
+ {"type": "bbox+mask", "masks": pred_masks, "label": "Detector (out)"},
260
+ {"type": "inv-mask", "masks": pred_masks, "label": "MaskPose (in)"},
261
+ {"type": "inv-mask+pose", "masks": pred_masks, "label": "MaskPose (out)"},
262
+ {"type": "mask", "masks": refined_masks, "label": "SAM Masks"},
263
+ {"type": "mask-out", "masks": refined_masks, "label": "Mask-Out"},
264
+ {"type": "pose", "masks": refined_masks, "label": "Final Poses"},
265
+ ]:
266
+ vis_img, colors = _visualize_predictions(
267
+ img.copy(), bboxes, scores, vis_def["masks"], keypoints, vis_type=vis_def["type"], mask_is_binary=True
268
+ )
269
+ if vis_def["type"] == "mask-out":
270
+ masked_out = vis_img
271
+ if with_text:
272
+ label = "BMP {:d}x: {}".format(iteration_idx + 1, vis_def["label"])
273
+ cv2.putText(vis_img, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 3)
274
+ cv2.putText(vis_img, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
275
+ out_path = os.path.join(
276
+ output_root, "{}_iter{}_{}.jpg".format(img_name, iteration_idx + 1, vis_def["label"].replace(" ", "_"))
277
+ )
278
+ cv2.imwrite(str(out_path), vis_img)
279
+
280
+ # Show prompting keypoints
281
+ tmp_img = img.copy()
282
+ for i, _ in enumerate(bboxes):
283
+ if len(sam_kpts[i]) > 0:
284
+ instance_color = colors[i].astype(int).tolist()
285
+ for kpt in sam_kpts[i]:
286
+ cv2.drawMarker(
287
+ tmp_img,
288
+ (int(kpt[0]), int(kpt[1])),
289
+ instance_color,
290
+ markerType=cv2.MARKER_CROSS,
291
+ markerSize=20,
292
+ thickness=3,
293
+ )
294
+ # Write the keypoint confidence next to the marker
295
+ cv2.putText(
296
+ tmp_img,
297
+ f"{kpt[2]:.2f}",
298
+ (int(kpt[0]) + 10, int(kpt[1]) - 10),
299
+ cv2.FONT_HERSHEY_SIMPLEX,
300
+ 0.5,
301
+ instance_color,
302
+ 1,
303
+ cv2.LINE_AA,
304
+ )
305
+ if with_text:
306
+ text = "BMP {:d}x: SAM prompts".format(iteration_idx + 1)
307
+ cv2.putText(tmp_img, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 3, cv2.LINE_AA)
308
+ cv2.putText(tmp_img, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2, cv2.LINE_AA)
309
+ cv2.imwrite("{:s}/{:s}_iter{:d}_prompting_kpts.jpg".format(output_root, img_name, iteration_idx + 1), tmp_img)
310
+
311
+ return masked_out
312
+
313
+
314
+ def visualize_demo(
315
+ img: np.ndarray, detections: Any,
316
+ ) -> List[np.ndarray]:
+ """
+ Generate demo visualizations comparing detector outputs with BMP-refined outputs.
+
+ Args:
+ img (np.ndarray): Original input image.
+ detections: InstanceData containing bboxes, scores, masks, keypoints.
+
+ Returns:
+ List[np.ndarray]: Rendered images in order: masked-out image, detector masks with poses ("RTMDet-L"), and refined masks with poses ("BMP").
+ """
331
+ bboxes = detections.bboxes
332
+ scores = detections.scores
333
+ pred_masks = detections.pred_masks
334
+ refined_masks = detections.refined_masks
335
+ keypoints = detections.keypoints
336
+
337
+ returns = []
338
+ for vis_def in [
339
+ {"type": "mask-out", "masks": refined_masks, "label": ""},
340
+ {"type": "mask+pose", "masks": pred_masks, "label": "RTMDet-L"},
341
+ {"type": "mask+pose", "masks": refined_masks, "label": "BMP"},
342
+ ]:
343
+ vis_img, colors = _visualize_predictions(
344
+ img.copy(), bboxes, scores, vis_def["masks"], keypoints, vis_type=vis_def["type"], mask_is_binary=True
345
+ )
346
+ returns.append(vis_img)
347
+
348
+ return returns
349
+
350
+
351
+ def create_GIF(
352
+ img_path: Path,
353
+ output_root: Path,
354
+ bmp_x: int = 2,
355
+ ) -> None:
356
+ """
357
+ Compile iteration images into an animated GIF using ffmpeg.
358
+
359
+ Args:
360
+ img_path (Path): Path to a sample iteration image.
361
+ output_root (Path): Directory to save the GIF.
362
+ bmp_x (int): Number of BMP iterations.
363
+
+ Note:
+ If ffmpeg is not installed or an expected image is missing, a warning is logged and no GIF is created.
+ """
368
+ display_dur = 1.5 # seconds
369
+ fade_dur = 1.0
370
+ fps = 10
371
+ scale_width = 300 # Resize width for GIF, height will be auto-scaled to maintain aspect ratio
372
+
373
+ # Check if ffmpeg is installed. If not, raise warning and return
374
+ if shutil.which("ffmpeg") is None:
375
+ print_log("FFMpeg is not installed. GIF creation will be skipped.", logger="current", level=logging.WARNING)
376
+ return
377
+ print_log("Creating GIF with FFmpeg...", logger="current")
378
+
379
+ dirname, filename = os.path.split(img_path)
380
+ img_name_wo_ext, _ = os.path.splitext(filename)
381
+
382
+ gif_image_names = [
383
+ "Detector_(out)",
384
+ "MaskPose_(in)",
385
+ "MaskPose_(out)",
386
+ "prompting_kpts",
387
+ "SAM_Masks",
388
+ "Mask-Out",
389
+ ]
390
+
391
+ # Create black image of the same size as the last image
392
+ last_img_path = os.path.join(dirname, "{}_iter1_{}".format(img_name_wo_ext, gif_image_names[0]) + ".jpg")
393
+ last_img = cv2.imread(last_img_path)
394
+ if last_img is None:
395
+ print_log("Could not read image {}.".format(last_img_path), logger="current", level=logging.ERROR)
396
+ return
397
+ black_img = np.zeros_like(last_img)
398
+ cv2.imwrite(os.path.join(dirname, "black_image.jpg"), black_img)
399
+
400
+ gif_images = []
401
+ for iter in range(bmp_x):
402
+ iter_img_path = os.path.join(dirname, "{}_iter{}_".format(img_name_wo_ext, iter + 1))
403
+ for img_name in gif_image_names:
404
+
405
+ if iter + 1 == bmp_x and img_name == "Mask-Out":
406
+ # Skip the last iteration's Mask-Out image
407
+ continue
408
+
409
+ img_file = "{}{}.jpg".format(iter_img_path, img_name)
410
+ if not os.path.exists(img_file):
411
+ print_log("{} does not exist, skipping.".format(img_file), logger="current", level=logging.WARNING)
412
+ continue
413
+ gif_images.append(img_file)
414
+
415
+ if len(gif_images) == 0:
416
+ print_log("No images found for GIF creation.", logger="current", level=logging.WARNING)
417
+ return
418
+
419
+ # Add 'before' and 'after' images
420
+ after1_img = os.path.join(dirname, "{}_iter{}_Final_Poses.jpg".format(img_name_wo_ext, bmp_x))
421
+ after2_img = os.path.join(dirname, "{}_iter{}_SAM_Masks.jpg".format(img_name_wo_ext, bmp_x))
422
+ # gif_images.append(os.path.join(dirname, "black_image.jpg")) # Add black image at the end
423
+ gif_images.append(after1_img)
424
+ gif_images.append(after2_img)
425
+ gif_images.append(os.path.join(dirname, "black_image.jpg")) # Add black image at the end
426
+
427
+ # Create a GIF from the images
428
+ gif_output_path = os.path.join(output_root, "{}_bmp_{}x.gif".format(img_name_wo_ext, bmp_x))
429
+
430
+ # 0. Make sure images exist and are divisible by 2
431
+ for img in gif_images:
432
+ if not os.path.exists(img):
433
+ print_log("Image {} does not exist, skipping GIF creation.".format(img), logger="current", level=logging.WARNING)
434
+ return
435
+ # Check if image dimensions are divisible by 2
436
+ img_data = cv2.imread(img)
437
+ if img_data.shape[1] % 2 != 0 or img_data.shape[0] % 2 != 0:
438
+ print_log(
439
+ "Image {} dimensions are not divisible by 2, resizing.".format(img),
440
+ logger="current",
441
+ level=logging.WARNING,
442
+ )
443
+ resized_img = cv2.resize(img_data, (img_data.shape[1] // 2 * 2, img_data.shape[0] // 2 * 2))
444
+ cv2.imwrite(img, resized_img)
445
+
446
+ # 1. inputs
447
+ in_args = []
448
+ for p in gif_images:
449
+ in_args += ["-loop", "1", "-t", str(display_dur), "-i", p]
450
+
451
+ # 2. build xfade chain
452
+ n = len(gif_images)
453
+ parts = []
454
+ for i in range(1, n):
455
+ # left label: first is input [0:v], then [v1], [v2], …
456
+ left = "[{}:v]".format(i - 1) if i == 1 else "[v{}]".format(i - 1)
457
+ right = "[{}:v]".format(i)
458
+ out = "[v{}]".format(i)
459
+ offset = (i - 1) * (display_dur + fade_dur) + display_dur
460
+ parts.append(
461
+ "{}{}xfade=transition=fade:".format(left, right)
462
+ + "duration={}:offset={:.3f}{}".format(fade_dur, offset, out)
463
+ )
464
+ filter_complex = ";".join(parts)
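+ # For three inputs this yields, e.g.:
+ # "[0:v][1:v]xfade=transition=fade:duration=1.0:offset=1.500[v1];[v1][2:v]xfade=transition=fade:duration=1.0:offset=4.000[v2]"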
465
+
466
+ # 3. make MP4 slideshow
467
+ mp4 = "slideshow.mp4"
468
+ cmd1 = [
469
+ "ffmpeg",
470
+ "-loglevel",
471
+ "error",
472
+ "-v",
473
+ "quiet",
474
+ "-hide_banner",
475
+ "-y",
476
+ *in_args,
477
+ "-filter_complex",
478
+ filter_complex,
479
+ "-map",
480
+ "[v{}]".format(n - 1),
481
+ "-c:v",
482
+ "libx264",
483
+ "-pix_fmt",
484
+ "yuv420p",
485
+ mp4,
486
+ ]
487
+ subprocess.run(cmd1, check=True)
488
+
489
+ # 4. palette
490
+ palette = "palette.png"
491
+ vf = "fps={}".format(fps)
492
+ if scale_width:
493
+ vf += ",scale={}: -1:flags=lanczos".format(scale_width)
494
+
495
+ # 5. generate palette
496
+ subprocess.run(
497
+ [
498
+ "ffmpeg",
499
+ "-loglevel",
500
+ "error",
501
+ "-v",
502
+ "quiet",
503
+ "-hide_banner",
504
+ "-y",
505
+ "-i",
506
+ mp4,
507
+ "-vf",
508
+ vf + ",palettegen",
509
+ palette,
510
+ ],
511
+ check=True,
512
+ stdout=subprocess.DEVNULL,
513
+ stderr=subprocess.PIPE,
514
+ )
515
+
516
+ # 6. build final GIF
517
+ subprocess.run(
518
+ [
519
+ "ffmpeg",
520
+ "-loglevel",
521
+ "error",
522
+ "-v",
523
+ "quiet",
524
+ "-hide_banner",
525
+ "-y",
526
+ "-i",
527
+ mp4,
528
+ "-i",
529
+ palette,
530
+ "-lavfi",
531
+ vf + "[x];[x][1:v]paletteuse",
532
+ gif_output_path,
533
+ ],
534
+ check=True,
535
+ stdout=subprocess.DEVNULL,
536
+ stderr=subprocess.PIPE,
537
+ )
538
+
539
+ # Clean up temporary files
540
+ os.remove(mp4)
541
+ os.remove(palette)
542
+ os.remove(os.path.join(dirname, "black_image.jpg"))
543
+
544
+ print_log(f"GIF saved as '{gif_output_path}'", logger="current")
545
+
546
+
547
+ def _update_bbox_by_mask(
548
+ bbox: List[int], mask_poly: Optional[List[List[int]]], image_shape: Tuple[int, int, int]
549
+ ) -> List[int]:
550
+ """
551
+ Adjust bounding box to tightly fit mask polygon.
552
+
553
+ Args:
554
+ bbox (List[int]): Original [x, y, w, h].
555
+ mask_poly (Optional[List[List[int]]]): Polygon coordinates.
556
+ image_shape (Tuple[int,int,int]): Image shape (H, W, C).
557
+
558
+ Returns:
559
+ List[int]: Updated [x, y, w, h] bounding box.
560
+ """
561
+ if mask_poly is None or len(mask_poly) == 0:
562
+ return bbox
563
+
564
+ mask_rle = Mask.frPyObjects(mask_poly, image_shape[0], image_shape[1])
565
+ mask_rle = Mask.merge(mask_rle)
566
+ bbox_segm_xywh = Mask.toBbox(mask_rle)
567
+ bbox_segm_xyxy = np.array(
568
+ [
569
+ bbox_segm_xywh[0],
570
+ bbox_segm_xywh[1],
571
+ bbox_segm_xywh[0] + bbox_segm_xywh[2],
572
+ bbox_segm_xywh[1] + bbox_segm_xywh[3],
573
+ ]
574
+ )
575
+
576
+ bbox = bbox_segm_xywh
577
+
578
+ return bbox.astype(int).tolist()
579
+
580
+
581
+ def pose_nms(config: Any, image_kpts: np.ndarray, image_bboxes: np.ndarray, num_valid_kpts: np.ndarray) -> np.ndarray:
582
+ """
583
+ Perform OKS-based non-maximum suppression on detected poses.
584
+
585
+ Args:
586
+ config (Any): Configuration with confidence_thr and oks_thr.
587
+ image_kpts (np.ndarray): Detected keypoints of shape (N, K, 3).
588
+ image_bboxes (np.ndarray): Corresponding bboxes (N,4).
589
+ num_valid_kpts (np.ndarray): Count of valid keypoints per instance.
590
+
591
+ Returns:
592
+ np.ndarray: Indices of kept instances.
593
+ """
594
+ # Sort image kpts by average score - lowest first
595
+ # scores = image_kpts[:, :, 2].mean(axis=1)
596
+ # sort_idx = np.argsort(scores)
597
+ # image_kpts = image_kpts[sort_idx, :, :]
598
+
599
+ # Compute OKS between all pairs of poses
600
+ oks_matrix = np.zeros((image_kpts.shape[0], image_kpts.shape[0]))
601
+ for i in range(image_kpts.shape[0]):
602
+ for j in range(image_kpts.shape[0]):
603
+ gt_bbox_xywh = image_bboxes[i].copy()
604
+ gt_bbox_xyxy = gt_bbox_xywh.copy()
605
+ gt_bbox_xyxy[2:] += gt_bbox_xyxy[:2]
606
+ gt = {
607
+ "keypoints": image_kpts[i].copy(),
608
+ "bbox": gt_bbox_xyxy,
609
+ "area": gt_bbox_xywh[2] * gt_bbox_xywh[3],
610
+ }
611
+ dt = {"keypoints": image_kpts[j].copy(), "bbox": gt_bbox_xyxy}
612
+ gt["keypoints"][:, 2] = (gt["keypoints"][:, 2] > config.confidence_thr) * 2
613
+ oks = compute_oks(gt, dt)
614
+ if oks > 1:
+ print_log("OKS > 1 encountered in pose_nms; clamping to 1.", logger="current", level=logging.WARNING)
+ oks = min(oks, 1.0)
616
+ oks_matrix[i, j] = oks
617
+
618
+ np.fill_diagonal(oks_matrix, -1)
619
+ is_subset = oks_matrix > config.oks_thr
620
+
621
+ remove_instances = []
622
+ while is_subset.any():
623
+ # Find the pair with the highest OKS
624
+ i, j = np.unravel_index(np.argmax(oks_matrix), oks_matrix.shape)
625
+
626
+ # Keep the one with the highest number of keypoints
627
+ if num_valid_kpts[i] > num_valid_kpts[j]:
628
+ remove_idx = j
629
+ else:
630
+ remove_idx = i
631
+
632
+ # Remove the column from is_subset
633
+ oks_matrix[:, remove_idx] = 0
634
+ oks_matrix[remove_idx, j] = 0
635
+ remove_instances.append(remove_idx)
636
+ is_subset = oks_matrix > config.oks_thr
637
+
638
+ keep_instances = np.setdiff1d(np.arange(image_kpts.shape[0]), remove_instances)
639
+
640
+ return keep_instances
641
+
642
+
643
+ def compute_oks(gt: Dict[str, Any], dt: Dict[str, Any], use_area: bool = True, per_kpt: bool = False) -> float:
644
+ """
645
+ Compute Object Keypoint Similarity (OKS) between ground-truth and detected poses.
646
+
647
+ Args:
648
+ gt (Dict): Ground-truth keypoints and bbox info.
649
+ dt (Dict): Detected keypoints and bbox info.
650
+ use_area (bool): Whether to normalize by GT area.
651
+ per_kpt (bool): Whether to return per-keypoint OKS array.
652
+
653
+ Returns:
654
+ float: OKS score or mean OKS.
655
+ """
656
+ sigmas = (
657
+ np.array([0.26, 0.25, 0.25, 0.35, 0.35, 0.79, 0.79, 0.72, 0.72, 0.62, 0.62, 1.07, 1.07, 0.87, 0.87, 0.89, 0.89])
658
+ / 10.0
659
+ )
660
+ vars = (sigmas * 2) ** 2
661
+ k = len(sigmas)
662
+ visibility_condition = lambda x: x > 0
663
+ g = np.array(gt["keypoints"]).reshape(k, 3)
664
+ xg = g[:, 0]
665
+ yg = g[:, 1]
666
+ vg = g[:, 2]
667
+ k1 = np.count_nonzero(visibility_condition(vg))
668
+ bb = gt["bbox"]
669
+ x0 = bb[0] - bb[2]
670
+ x1 = bb[0] + bb[2] * 2
671
+ y0 = bb[1] - bb[3]
672
+ y1 = bb[1] + bb[3] * 2
673
+
674
+ d = np.array(dt["keypoints"]).reshape((k, 3))
675
+ xd = d[:, 0]
676
+ yd = d[:, 1]
677
+
678
+ if k1 > 0:
679
+ # measure the per-keypoint distance if keypoints visible
680
+ dx = xd - xg
681
+ dy = yd - yg
682
+
683
+ else:
684
+ # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
685
+ z = np.zeros((k))
686
+ dx = np.max((z, x0 - xd), axis=0) + np.max((z, xd - x1), axis=0)
687
+ dy = np.max((z, y0 - yd), axis=0) + np.max((z, yd - y1), axis=0)
688
+
689
+ if use_area:
690
+ e = (dx**2 + dy**2) / vars / (gt["area"] + np.spacing(1)) / 2
691
+ else:
692
+ tmparea = gt["bbox"][3] * gt["bbox"][2] * 0.53
693
+ e = (dx**2 + dy**2) / vars / (tmparea + np.spacing(1)) / 2
694
+
695
+ if per_kpt:
696
+ oks = np.exp(-e)
697
+ if k1 > 0:
698
+ oks[~visibility_condition(vg)] = 0
699
+
700
+ else:
701
+ if k1 > 0:
702
+ e = e[visibility_condition(vg)]
703
+ oks = np.sum(np.exp(-e)) / e.shape[0]
704
+
705
+ return oks
demo/mm_utils.py ADDED
@@ -0,0 +1,106 @@
1
+ """
2
+ This module provides high-level interfaces to run MMDetection and MMPose
3
+ models sequentially. Users can call run_MMDetector and run_MMPose from
4
+ other scripts (e.g., bmp_demo.py) to perform object detection and
5
+ pose estimation in a clean, modular fashion.
6
+ """
7
+
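+ # Typical call sequence (sketch; 'detector' and 'pose_estimator' are initialized elsewhere, e.g. in
+ # demo/bmp_demo.py via init_detector / init_pose_estimator):
+ # det_instances = run_MMDetector(detector, image, det_cat_id=0, bbox_thr=0.3, nms_thr=0.3)
+ # pose_instances = run_MMPose(pose_estimator, image, detections=det_instances, kpt_thr=0.3)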
8
+ import numpy as np
9
+ from mmdet.apis import inference_detector
10
+ from mmengine.structures import InstanceData
11
+
12
+ from mmpose.apis import inference_topdown
13
+ from mmpose.evaluation.functional import nms
14
+ from mmpose.structures import merge_data_samples
15
+
16
+
17
+ def run_MMDetector(detector, image, det_cat_id: int = 0, bbox_thr: float = 0.3, nms_thr: float = 0.3) -> InstanceData:
18
+ """
19
+ Run an MMDetection model to detect bounding boxes (and masks) in an image.
20
+
21
+ Args:
22
+ detector: An initialized MMDetection detector model.
23
+ image: Input image as file path or BGR numpy array.
24
+ det_cat_id: Category ID to filter detections (default is 0 for 'person').
25
+ bbox_thr: Minimum bounding box score threshold.
26
+ nms_thr: IoU threshold for Non-Maximum Suppression (NMS).
27
+
28
+ Returns:
29
+ InstanceData: A structure containing filtered bboxes, bbox_scores, and masks (if available).
30
+ """
31
+ # Run detection
32
+ det_result = inference_detector(detector, image)
33
+ pred_instances = det_result.pred_instances.cpu().numpy()
34
+
35
+ # Aggregate bboxes and scores into an (N, 5) array
36
+ bboxes_all = np.concatenate((pred_instances.bboxes, pred_instances.scores[:, None]), axis=1)
37
+
38
+ # Filter by category and score
39
+ keep_mask = np.logical_and(pred_instances.labels == det_cat_id, pred_instances.scores > bbox_thr)
40
+ if not np.any(keep_mask):
41
+ # Return empty structure if nothing passes threshold
42
+ return InstanceData(bboxes=np.zeros((0, 4)), bbox_scores=np.zeros((0,)), masks=np.zeros((0, 1, 1)))
43
+
44
+ bboxes = bboxes_all[keep_mask]
45
+ masks = getattr(pred_instances, "masks", None)
46
+ if masks is not None:
47
+ masks = masks[keep_mask]
48
+
49
+ # Sort detections by descending score
50
+ order = np.argsort(bboxes[:, 4])[::-1]
51
+ bboxes = bboxes[order]
52
+ if masks is not None:
53
+ masks = masks[order]
54
+
55
+ # Apply Non-Maximum Suppression
56
+ keep_indices = nms(bboxes, nms_thr)
57
+ bboxes = bboxes[keep_indices]
58
+ if masks is not None:
59
+ masks = masks[keep_indices]
60
+
61
+ # Construct InstanceData to return
62
+ det_instances = InstanceData(bboxes=bboxes[:, :4], bbox_scores=bboxes[:, 4], masks=masks)
63
+ return det_instances
64
+
65
+
66
+ def run_MMPose(pose_estimator, image, detections: InstanceData, kpt_thr: float = 0.3) -> InstanceData:
67
+ """
68
+ Run an MMPose top-down model to estimate human pose given detected bounding boxes.
69
+
70
+ Args:
71
+ pose_estimator: An initialized MMPose model.
72
+ image: Input image as file path or RGB/BGR numpy array.
73
+ detections: InstanceData from run_MMDetector containing bboxes and masks.
74
+ kpt_thr: Minimum keypoint score threshold to filter low-confidence joints.
75
+
76
+ Returns:
77
+ InstanceData: A structure containing estimated keypoints, keypoint_scores,
78
+ original bboxes, and masks (if provided).
79
+ """
80
+ # Extract bounding boxes
81
+ bboxes = detections.bboxes
82
+ if bboxes.shape[0] == 0:
83
+ # No detections => empty pose data
84
+ return InstanceData(
85
+ keypoints=np.zeros((0, 17, 3)),
86
+ keypoint_scores=np.zeros((0, 17)),
87
+ bboxes=bboxes,
88
+ bbox_scores=detections.bbox_scores,
89
+ masks=detections.masks,
90
+ )
91
+
92
+ # Run top-down pose estimation
93
+ pose_results = inference_topdown(pose_estimator, image, bboxes, masks=detections.masks)
94
+ data_samples = merge_data_samples(pose_results)
95
+
96
+ # Attach masks back into the data_samples if available
97
+ if detections.masks is not None:
98
+ data_samples.pred_instances.pred_masks = detections.masks
99
+
100
+ # Low-confidence keypoints (below kpt_thr) could be zeroed out here; the mask is computed but the filtering is left disabled
101
+ kp_scores = data_samples.pred_instances.keypoint_scores
102
+ kp_mask = kp_scores >= kpt_thr
103
+ # data_samples.pred_instances.keypoints[~kp_mask] = [0, 0, 0]
104
+
105
+ # Return final InstanceData for poses
106
+ return data_samples.pred_instances
demo/posevis_lite.py ADDED
@@ -0,0 +1,507 @@
1
+ import os
2
+ from typing import Any, Dict, List, Optional, Tuple, Union
3
+
4
+ import cv2
5
+ import numpy as np
6
+
7
+ NEUTRAL_COLOR = (52, 235, 107)
8
+
9
+ LEFT_ARM_COLOR = (216, 235, 52)
10
+ LEFT_LEG_COLOR = (235, 107, 52)
11
+ LEFT_SIDE_COLOR = (245, 188, 113)
12
+ LEFT_FACE_COLOR = (235, 52, 107)
13
+
14
+ RIGHT_ARM_COLOR = (52, 235, 216)
15
+ RIGHT_LEG_COLOR = (52, 107, 235)
16
+ RIGHT_SIDE_COLOR = (52, 171, 235)
17
+ RIGHT_FACE_COLOR = (107, 52, 235)
18
+
19
+ COCO_MARKERS = [
20
+ ["nose", cv2.MARKER_CROSS, NEUTRAL_COLOR],
21
+ ["left_eye", cv2.MARKER_SQUARE, LEFT_FACE_COLOR],
22
+ ["right_eye", cv2.MARKER_SQUARE, RIGHT_FACE_COLOR],
23
+ ["left_ear", cv2.MARKER_CROSS, LEFT_FACE_COLOR],
24
+ ["right_ear", cv2.MARKER_CROSS, RIGHT_FACE_COLOR],
25
+ ["left_shoulder", cv2.MARKER_TRIANGLE_UP, LEFT_ARM_COLOR],
26
+ ["right_shoulder", cv2.MARKER_TRIANGLE_UP, RIGHT_ARM_COLOR],
27
+ ["left_elbow", cv2.MARKER_SQUARE, LEFT_ARM_COLOR],
28
+ ["right_elbow", cv2.MARKER_SQUARE, RIGHT_ARM_COLOR],
29
+ ["left_wrist", cv2.MARKER_CROSS, LEFT_ARM_COLOR],
30
+ ["right_wrist", cv2.MARKER_CROSS, RIGHT_ARM_COLOR],
31
+ ["left_hip", cv2.MARKER_TRIANGLE_UP, LEFT_LEG_COLOR],
32
+ ["right_hip", cv2.MARKER_TRIANGLE_UP, RIGHT_LEG_COLOR],
33
+ ["left_knee", cv2.MARKER_SQUARE, LEFT_LEG_COLOR],
34
+ ["right_knee", cv2.MARKER_SQUARE, RIGHT_LEG_COLOR],
35
+ ["left_ankle", cv2.MARKER_TILTED_CROSS, LEFT_LEG_COLOR],
36
+ ["right_ankle", cv2.MARKER_TILTED_CROSS, RIGHT_LEG_COLOR],
37
+ ]
38
+
39
+
40
+ COCO_SKELETON = [
41
+ [[16, 14], LEFT_LEG_COLOR], # Left ankle - Left knee
42
+ [[14, 12], LEFT_LEG_COLOR], # Left knee - Left hip
43
+ [[17, 15], RIGHT_LEG_COLOR], # Right ankle - Right knee
44
+ [[15, 13], RIGHT_LEG_COLOR], # Right knee - Right hip
45
+ [[12, 13], NEUTRAL_COLOR], # Left hip - Right hip
46
+ [[6, 12], LEFT_SIDE_COLOR], # Left hip - Left shoulder
47
+ [[7, 13], RIGHT_SIDE_COLOR], # Right hip - Right shoulder
48
+ [[6, 7], NEUTRAL_COLOR], # Left shoulder - Right shoulder
49
+ [[6, 8], LEFT_ARM_COLOR], # Left shoulder - Left elbow
50
+ [[7, 9], RIGHT_ARM_COLOR], # Right shoulder - Right elbow
51
+ [[8, 10], LEFT_ARM_COLOR], # Left elbow - Left wrist
52
+ [[9, 11], RIGHT_ARM_COLOR], # Right elbow - Right wrist
53
+ [[2, 3], NEUTRAL_COLOR], # Left eye - Right eye
54
+ [[1, 2], LEFT_FACE_COLOR], # Nose - Left eye
55
+ [[1, 3], RIGHT_FACE_COLOR], # Nose - Right eye
56
+ [[2, 4], LEFT_FACE_COLOR], # Left eye - Left ear
57
+ [[3, 5], RIGHT_FACE_COLOR], # Right eye - Right ear
58
+ [[4, 6], LEFT_FACE_COLOR], # Left ear - Left shoulder
59
+ [[5, 7], RIGHT_FACE_COLOR], # Right ear - Right shoulder
60
+ ]
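+ # Bone endpoints above are 1-based COCO keypoint indices (1 = nose, ..., 17 = right ankle), matching
+ # the order of COCO_MARKERS.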
61
+
62
+
63
+ def _draw_line(
64
+ img: np.ndarray,
65
+ start: Tuple[float, float],
66
+ stop: Tuple[float, float],
67
+ color: Tuple[int, int, int],
68
+ line_type: str,
69
+ thickness: int = 1,
70
+ ) -> np.ndarray:
71
+ """
72
+ Draw a line segment on an image, supporting solid, dashed, or dotted styles.
73
+
74
+ Args:
75
+ img (np.ndarray): BGR image of shape (H, W, 3).
76
+ start (tuple of float): (x, y) start coordinates.
77
+ stop (tuple of float): (x, y) end coordinates.
78
+ color (tuple of int): BGR color values.
79
+ line_type (str): One of 'solid', 'dashed', or 'doted'.
80
+ thickness (int): Line thickness in pixels.
81
+
82
+ Returns:
83
+ np.ndarray: Image with the line drawn.
84
+ """
85
+ start = np.array(start)[:2]
86
+ stop = np.array(stop)[:2]
87
+ if line_type.lower() == "solid":
88
+ img = cv2.line(
89
+ img,
90
+ (int(start[0]), int(start[1])),
91
+ (int(stop[0]), int(stop[1])),
92
+ color=(0, 0, 0),
93
+ thickness=thickness+1,
94
+ lineType=cv2.LINE_AA,
95
+ )
96
+ img = cv2.line(
97
+ img,
98
+ (int(start[0]), int(start[1])),
99
+ (int(stop[0]), int(stop[1])),
100
+ color=color,
101
+ thickness=thickness,
102
+ lineType=cv2.LINE_AA,
103
+ )
104
+ elif line_type.lower() == "dashed":
105
+ delta = stop - start
106
+ length = np.linalg.norm(delta)
107
+ frac = np.linspace(0, 1, num=int(length / 5), endpoint=True)
108
+ for i in range(0, len(frac) - 1, 2):
109
+ s = start + frac[i] * delta
110
+ e = start + frac[i + 1] * delta
111
+ img = cv2.line(
112
+ img,
113
+ (int(s[0]), int(s[1])),
114
+ (int(e[0]), int(e[1])),
115
+ color=color,
116
+ thickness=thickness,
117
+ lineType=cv2.LINE_AA,
118
+ )
119
+ elif line_type.lower() == "doted":
120
+ delta = stop - start
121
+ length = np.linalg.norm(delta)
122
+ frac = np.linspace(0, 1, num=int(length / 5), endpoint=True)
123
+ for i in range(0, len(frac)):
124
+ s = start + frac[i] * delta
125
+ img = cv2.circle(
126
+ img,
127
+ (int(s[0]), int(s[1])),
128
+ radius=max(thickness // 2, 1),
129
+ color=color,
130
+ thickness=-1,
131
+ lineType=cv2.LINE_AA,
132
+ )
133
+ return img
134
+
135
+
136
+ def pose_visualization(
137
+ img: Union[str, np.ndarray],
138
+ keypoints: Union[Dict[str, Any], np.ndarray],
139
+ format: str = "COCO",
140
+ greyness: float = 1.0,
141
+ show_markers: bool = True,
142
+ show_bones: bool = True,
143
+ line_type: str = "solid",
144
+ width_multiplier: float = 1.0,
145
+ bbox_width_multiplier: float = 1.0,
146
+ show_bbox: bool = False,
147
+ differ_individuals: bool = False,
148
+ confidence_thr: float = 0.3,
149
+ errors: Optional[np.ndarray] = None,
150
+ color: Optional[Tuple[int, int, int]] = None,
151
+ keep_image_size: bool = False,
152
+ return_padding: bool = False,
153
+ ) -> Union[np.ndarray, Tuple[np.ndarray, List[int]]]:
154
+ """
155
+ Overlay pose keypoints and skeleton on an image.
156
+
157
+ Args:
158
+ img (str or np.ndarray): Path to image file or BGR image array.
159
+ keypoints (dict or np.ndarray): Either a dict with 'bbox' and 'keypoints' or
160
+ an array of shape (17, 2 or 3) or multiple poses stacked.
161
+ format (str): Keypoint format, currently only 'COCO'.
162
+ greyness (float): Factor for bone/marker color intensity (0.0-1.0).
163
+ show_markers (bool): Whether to draw keypoint markers.
164
+ show_bones (bool): Whether to draw skeleton bones.
165
+ line_type (str): One of 'solid', 'dashed', 'doted' for bone style.
166
+ width_multiplier (float): Line width scaling factor for bones.
167
+ bbox_width_multiplier (float): Line width scaling factor for bounding box.
168
+ show_bbox (bool): Whether to draw bounding box around keypoints.
169
+ differ_individuals (bool): Use distinct color per individual pose.
170
+ confidence_thr (float): Confidence threshold for keypoint visibility.
171
+ errors (np.ndarray or None): Optional array of per-kpt errors (17,1).
172
+ color (tuple or None): Override color for markers and bones.
173
+ keep_image_size (bool): Prevent image padding for out-of-bounds keypoints.
174
+ return_padding (bool): If True, also return padding offsets [top,bottom,left,right].
175
+
176
+ Returns:
177
+ np.ndarray or (np.ndarray, list of int): Annotated image, and optional
178
+ padding offsets if `return_padding` is True.
179
+ """
180
+
181
+ bbox = None
182
+ if isinstance(keypoints, dict):
183
+ try:
184
+ bbox = np.array(keypoints["bbox"]).flatten()
185
+ except KeyError:
186
+ pass
187
+ keypoints = np.array(keypoints["keypoints"])
188
+
189
+ # If keypoints is a list of poses, draw them all
190
+ if len(keypoints) % 17 != 0 or keypoints.ndim == 3:
191
+
192
+ if color is not None:
193
+ if not isinstance(color, (list, tuple)):
194
+ color = [color for keypoint in keypoints]
195
+ else:
196
+ color = [None for keypoint in keypoints]
197
+
198
+ max_padding = [0, 0, 0, 0]
199
+ for keypoint, clr in zip(keypoints, color):
200
+ img = pose_visualization(
201
+ img,
202
+ keypoint,
203
+ format=format,
204
+ greyness=greyness,
205
+ show_markers=show_markers,
206
+ show_bones=show_bones,
207
+ line_type=line_type,
208
+ width_multiplier=width_multiplier,
209
+ bbox_width_multiplier=bbox_width_multiplier,
210
+ show_bbox=show_bbox,
211
+ differ_individuals=differ_individuals,
212
+ color=clr,
213
+ confidence_thr=confidence_thr,
214
+ keep_image_size=keep_image_size,
215
+ return_padding=return_padding,
216
+ )
217
+ if return_padding:
218
+ img, padding = img
219
+ max_padding = [max(max_padding[i], int(padding[i])) for i in range(4)]
220
+
221
+ if return_padding:
222
+ return img, max_padding
223
+ else:
224
+ return img
225
+
226
+ keypoints = np.array(keypoints).reshape(17, -1)
227
+ # If keypoint visibility is not provided, assume all keypoints are visible
228
+ if keypoints.shape[1] == 2:
229
+ keypoints = np.hstack([keypoints, np.ones((17, 1)) * 2])
230
+
231
+ assert keypoints.shape[1] == 3, "Keypoints should be in the format (x, y, visibility)"
232
+ assert keypoints.shape[0] == 17, "Keypoints should be in the format (x, y, visibility)"
233
+
234
+ if errors is not None:
235
+ errors = np.array(errors).reshape(17, -1)
236
+ assert errors.shape[1] == 1, "Errors should be in the format (K, r)"
237
+ assert errors.shape[0] == 17, "Errors should be in the format (K, r)"
238
+ else:
239
+ errors = np.ones((17, 1)) * np.nan
240
+
241
+ # If keypoint visibility is float between 0 and 1, it is detection
242
+ # If conf < confidence_thr: conf = 1
243
+ # If conf >= confidence_thr: conf = 2
244
+ vis_is_float = np.any(np.logical_and(keypoints[:, -1] > 0, keypoints[:, -1] < 1))
245
+ if keypoints.shape[1] == 3 and vis_is_float:
246
+ # print("before", keypoints[:, -1])
247
+ lower_idx = keypoints[:, -1] < confidence_thr
248
+ keypoints[lower_idx, -1] = 1
249
+ keypoints[~lower_idx, -1] = 2
250
+ # print("after", keypoints[:, -1])
251
+ # print("-"*20)
252
+
253
+ # All visibility values should be ints
254
+ keypoints[:, -1] = keypoints[:, -1].astype(int)
255
+
256
+ if isinstance(img, str):
257
+ img = cv2.imread(img)
258
+
259
+ if img is None:
260
+ if return_padding:
261
+ return None, [0, 0, 0, 0]
262
+ else:
263
+ return None
264
+
265
+ if not (keypoints[:, 2] > 0).any():
266
+ if return_padding:
267
+ return img, [0, 0, 0, 0]
268
+ else:
269
+ return img
270
+
271
+ valid_kpts = (keypoints[:, 0] > 0) & (keypoints[:, 1] > 0)
272
+ num_valid_kpts = np.sum(valid_kpts)
273
+
274
+ if num_valid_kpts == 0:
275
+ if return_padding:
276
+ return img, [0, 0, 0, 0]
277
+ else:
278
+ return img
279
+
280
+ min_x_kpts = np.min(keypoints[keypoints[:, 2] > 0, 0])
281
+ min_y_kpts = np.min(keypoints[keypoints[:, 2] > 0, 1])
282
+ max_x_kpts = np.max(keypoints[keypoints[:, 2] > 0, 0])
283
+ max_y_kpts = np.max(keypoints[keypoints[:, 2] > 0, 1])
284
+ if bbox is None:
285
+ min_x = min_x_kpts
286
+ min_y = min_y_kpts
287
+ max_x = max_x_kpts
288
+ max_y = max_y_kpts
289
+ else:
290
+ min_x = bbox[0]
291
+ min_y = bbox[1]
292
+ max_x = bbox[2]
293
+ max_y = bbox[3]
294
+
295
+ max_area = (max_x - min_x) * (max_y - min_y)
296
+ diagonal = np.sqrt((max_x - min_x) ** 2 + (max_y - min_y) ** 2)
297
+ line_width = max(int(np.sqrt(max_area) / 500 * width_multiplier), 1)
298
+ bbox_line_width = max(int(np.sqrt(max_area) / 500 * bbox_width_multiplier), 1)
299
+ marker_size = max(int(np.sqrt(max_area) / 80), 1)
300
+ invisible_marker_size = max(int(np.sqrt(max_area) / 100), 1)
301
+ marker_thickness = max(int(np.sqrt(max_area) / 100), 1)
302
+
303
+ if differ_individuals:
304
+ if color is not None:
305
+ instance_color = color
306
+ else:
307
+ instance_color = np.random.randint(0, 255, size=(3,)).tolist()
308
+ instance_color = tuple(instance_color)
309
+
310
+ # Pad image with dark gray if keypoints are outside the image
311
+ if not keep_image_size:
312
+ padding = [
313
+ max(0, -min_y_kpts),
314
+ max(0, max_y_kpts - img.shape[0]),
315
+ max(0, -min_x_kpts),
316
+ max(0, max_x_kpts - img.shape[1]),
317
+ ]
318
+ padding = [int(p) for p in padding]
319
+ img = cv2.copyMakeBorder(
320
+ img,
321
+ padding[0],
322
+ padding[1],
323
+ padding[2],
324
+ padding[3],
325
+ cv2.BORDER_CONSTANT,
326
+ value=(80, 80, 80),
327
+ )
328
+
329
+ # Add padding to bbox and kpts
330
+ value_x_to_add = max(0, -min_x_kpts)
331
+ value_y_to_add = max(0, -min_y_kpts)
332
+ keypoints[keypoints[:, 2] > 0, 0] += value_x_to_add
333
+ keypoints[keypoints[:, 2] > 0, 1] += value_y_to_add
334
+ if bbox is not None:
335
+ bbox[0] += value_x_to_add
336
+ bbox[1] += value_y_to_add
337
+ bbox[2] += value_x_to_add
338
+ bbox[3] += value_y_to_add
339
+
340
+ if show_bbox and not (bbox is None):
341
+ pts = [
342
+ (bbox[0], bbox[1]),
343
+ (bbox[0], bbox[3]),
344
+ (bbox[2], bbox[3]),
345
+ (bbox[2], bbox[1]),
346
+ (bbox[0], bbox[1]),
347
+ ]
348
+ for i in range(len(pts) - 1):
349
+ if differ_individuals:
350
+ img = _draw_line(img, pts[i], pts[i + 1], instance_color, "doted", thickness=bbox_line_width)
351
+ else:
352
+ img = _draw_line(img, pts[i], pts[i + 1], (0, 255, 0), line_type, thickness=bbox_line_width)
353
+
354
+ if show_markers:
355
+ for kpt, marker_info, err in zip(keypoints, COCO_MARKERS, errors):
356
+ if kpt[0] == 0 and kpt[1] == 0:
357
+ continue
358
+
359
+ if kpt[2] != 2:
360
+ color = (140, 140, 140)
361
+ elif differ_individuals:
362
+ color = instance_color
363
+ else:
364
+ color = marker_info[2]
365
+
366
+ if kpt[2] == 1:
367
+ img_overlay = img.copy()
368
+ img_overlay = cv2.drawMarker(
369
+ img_overlay,
370
+ (int(kpt[0]), int(kpt[1])),
371
+ color=color,
372
+ markerType=marker_info[1],
373
+ markerSize=marker_size,
374
+ thickness=marker_thickness,
375
+ )
376
+ img = cv2.addWeighted(img_overlay, 0.4, img, 0.6, 0)
377
+
378
+ else:
379
+ img = cv2.drawMarker(
380
+ img,
381
+ (int(kpt[0]), int(kpt[1])),
382
+ color=color,
383
+ markerType=marker_info[1],
384
+ markerSize=invisible_marker_size if kpt[2] == 1 else marker_size,
385
+ thickness=marker_thickness,
386
+ )
387
+
388
+ if not np.isnan(err).any():
389
+ radius = err * diagonal
390
+ clr = (0, 0, 255) if "solid" in line_type else (0, 255, 0)
391
+ plus = 1 if "solid" in line_type else -1
392
+ img = cv2.circle(
393
+ img,
394
+ (int(kpt[0]), int(kpt[1])),
395
+ radius=int(radius),
396
+ color=clr,
397
+ thickness=1,
398
+ lineType=cv2.LINE_AA,
399
+ )
400
+ dx = np.sqrt(radius**2 / 2)
401
+ img = cv2.line(
402
+ img,
403
+ (int(kpt[0]), int(kpt[1])),
404
+ (int(kpt[0] + plus * dx), int(kpt[1] - dx)),
405
+ color=clr,
406
+ thickness=1,
407
+ lineType=cv2.LINE_AA,
408
+ )
409
+
410
+ if show_bones:
411
+ for bone_info in COCO_SKELETON:
412
+ kp1 = keypoints[bone_info[0][0] - 1, :]
413
+ kp2 = keypoints[bone_info[0][1] - 1, :]
414
+
415
+ if (kp1[0] == 0 and kp1[1] == 0) or (kp2[0] == 0 and kp2[1] == 0):
416
+ continue
417
+
418
+ dashed = kp1[2] == 1 or kp2[2] == 1
419
+
420
+ if differ_individuals:
421
+ color = np.array(instance_color)
422
+ else:
423
+ color = np.array(bone_info[1])
424
+ color = (color * greyness).astype(int).tolist()
425
+
426
+ if dashed:
427
+ img_overlay = img.copy()
428
+ img_overlay = _draw_line(img_overlay, kp1, kp2, color, line_type, thickness=line_width)
429
+ img = cv2.addWeighted(img_overlay, 0.4, img, 0.6, 0)
430
+
431
+ else:
432
+ img = _draw_line(img, kp1, kp2, color, line_type, thickness=line_width)
433
+
434
+ if return_padding:
435
+ return img, padding
436
+ else:
437
+ return img
438
+
439
+
440
+ if __name__ == "__main__":
441
+ kpts = np.array(
442
+ [
443
+ 344,
444
+ 222,
445
+ 2,
446
+ 356,
447
+ 211,
448
+ 2,
449
+ 330,
450
+ 211,
451
+ 2,
452
+ 372,
453
+ 220,
454
+ 2,
455
+ 309,
456
+ 224,
457
+ 2,
458
+ 413,
459
+ 279,
460
+ 2,
461
+ 274,
462
+ 300,
463
+ 2,
464
+ 444,
465
+ 372,
466
+ 2,
467
+ 261,
468
+ 396,
469
+ 2,
470
+ 398,
471
+ 359,
472
+ 2,
473
+ 316,
474
+ 372,
475
+ 2,
476
+ 407,
477
+ 489,
478
+ 2,
479
+ 185,
480
+ 580,
481
+ 2,
482
+ 0,
483
+ 0,
484
+ 0,
485
+ 0,
486
+ 0,
487
+ 0,
488
+ 0,
489
+ 0,
490
+ 0,
491
+ 0,
492
+ 0,
493
+ 0,
494
+ ]
495
+ )
496
+
497
+ kpts = kpts.reshape(-1, 3)
498
+ kpts[:, -1] = np.random.randint(1, 3, size=(17,))
499
+
500
+ img = pose_visualization("demo/posevis_test.jpg", kpts, show_markers=True, line_type="solid")
501
+
502
+ kpts2 = kpts.copy()
503
+ kpts2[kpts2[:, 1] > 0, :2] += 10
504
+ img = pose_visualization(img, kpts2, show_markers=False, line_type="doted")
505
+
506
+ os.makedirs("demo/outputs", exist_ok=True)
507
+ cv2.imwrite("demo/outputs/posevis_test_out.jpg", img)
demo/sam2_utils.py ADDED
@@ -0,0 +1,714 @@
1
+ """
2
+ SAM2 utilities for BMP demo:
3
+ - Build and prepare SAM model
4
+ - Convert poses to segmentation
5
+ - Compute mask-pose consistency
6
+ """
7
+
8
+ from typing import Any, List, Optional, Tuple
9
+
10
+ import numpy as np
11
+ import torch
12
+ from mmengine.structures import InstanceData
13
+ from pycocotools import mask as Mask
14
+ from sam2.build_sam import build_sam2
15
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
16
+
17
+ # Threshold for keypoint validity in mask-pose consistency
18
+ STRICT_KPT_THRESHOLD: float = 0.5
19
+
20
+
21
+ def _validate_sam_args(sam_args):
22
+ """
23
+ Validate that all required sam_args attributes are present.
24
+ """
25
+ required = [
26
+ "crop",
27
+ "use_bbox",
28
+ "confidence_thr",
29
+ "ignore_small_bboxes",
30
+ "num_pos_keypoints",
31
+ "num_pos_keypoints_if_crowd",
32
+ "crowd_by_max_iou",
33
+ "batch",
34
+ "exclusive_masks",
35
+ "extend_bbox",
36
+ "pose_mask_consistency",
37
+ "visibility_thr",
38
+ ]
39
+ for param in required:
40
+ if not hasattr(sam_args, param):
41
+ raise AttributeError(f"Missing required arg {param} in sam_args")
42
+
43
+
44
+ def _get_max_ious(bboxes: List[np.ndarray]) -> np.ndarray:
45
+ """
46
+ Compute maximum IoU for each bbox against others.
47
+ """
48
+ is_crowd = [0] * len(bboxes)
49
+ ious = Mask.iou(bboxes, bboxes, is_crowd)
50
+ mat = np.array(ious)
51
+ np.fill_diagonal(mat, 0)
52
+ return mat.max(axis=1)
53
+
54
+
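For intuition, a tiny self-contained check of the helper above; the box values are made up and rows are xywh boxes, as expected by pycocotools. The first two boxes overlap, the third does not.

import numpy as np

boxes = np.array([[0, 0, 10, 10],        # overlaps the second box
                  [5, 0, 10, 10],
                  [100, 100, 10, 10]], dtype=np.float64)
print(_get_max_ious(boxes))   # approx [0.33, 0.33, 0.0]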
55
+ def _compute_one_mask_pose_consistency(
56
+ mask: np.ndarray, pos_keypoints: Optional[np.ndarray] = None, neg_keypoints: Optional[np.ndarray] = None
57
+ ) -> float:
58
+ """
59
+ Compute a consistency score between a mask and given keypoints.
60
+
61
+ Args:
62
+ mask (np.ndarray): Binary mask of shape (H, W).
63
+ pos_keypoints (Optional[np.ndarray]): Positive keypoints array (N, 3).
64
+ neg_keypoints (Optional[np.ndarray]): Negative keypoints array (M, 3).
65
+
66
+ Returns:
67
+ float: Weighted mean of positive and negative keypoint consistency.
68
+ """
69
+ if mask is None:
70
+ return 0.0
71
+
72
+ def _mean_inside(points: np.ndarray) -> float:
73
+ if points.size == 0:
74
+ return 0.0
75
+ pts_int = np.floor(points[:, :2]).astype(int)
76
+ pts_int[:, 0] = np.clip(pts_int[:, 0], 0, mask.shape[1] - 1)
77
+ pts_int[:, 1] = np.clip(pts_int[:, 1], 0, mask.shape[0] - 1)
78
+ vals = mask[pts_int[:, 1], pts_int[:, 0]]
79
+ return vals.mean() if vals.size > 0 else 0.0
80
+
81
+ pos_mean = 0.0
82
+ if pos_keypoints is not None:
83
+ valid = pos_keypoints[:, 2] > STRICT_KPT_THRESHOLD
84
+ pos_mean = _mean_inside(pos_keypoints[valid])
85
+
86
+ neg_mean = 0.0
87
+ if neg_keypoints is not None:
88
+ valid = neg_keypoints[:, 2] > STRICT_KPT_THRESHOLD
89
+ pts = neg_keypoints[valid][:, :2]
90
+ inside = mask[np.floor(pts[:, 1]).astype(int), np.floor(pts[:, 0]).astype(int)]
91
+ neg_mean = (~inside.astype(bool)).mean() if inside.size > 0 else 0.0
92
+
93
+ return 0.5 * pos_mean + 0.5 * neg_mean
94
+
95
+
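A minimal sketch of the mask-pose consistency score defined above, on a toy 4x4 mask with hypothetical keypoints: the score is the average of (fraction of confident positive keypoints inside the mask) and (fraction of confident negative keypoints outside the mask).

import numpy as np

# Toy 4x4 binary mask: the left half of the image is foreground.
mask = np.zeros((4, 4), dtype=bool)
mask[:, :2] = True

# Hypothetical (x, y, confidence) keypoints; only confidence > 0.5 counts,
# mirroring STRICT_KPT_THRESHOLD above.
pos = np.array([[0.0, 0.0, 0.9],    # inside the mask
                [3.0, 3.0, 0.8],    # outside the mask
                [1.0, 1.0, 0.3]])   # ignored: low confidence
neg = np.array([[3.0, 0.0, 0.9]])   # outside the mask -> consistent

pos = pos[pos[:, 2] > 0.5]
pos_inside = mask[pos[:, 1].astype(int), pos[:, 0].astype(int)].mean()      # 0.5
neg_outside = (~mask[neg[:, 1].astype(int), neg[:, 0].astype(int)]).mean()  # 1.0
print(0.5 * pos_inside + 0.5 * neg_outside)   # 0.75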
96
+ def _select_keypoints(
97
+ args: Any,
98
+ kpts: np.ndarray,
99
+ num_visible: int,
100
+ bbox: Optional[Tuple[float, float, float, float]] = None,
101
+ method: Optional[str] = "distance+confidence",
102
+ ) -> Tuple[np.ndarray, np.ndarray]:
103
+ """
104
+ Select and order keypoints for SAM prompting based on specified method.
105
+
106
+ Args:
107
+ args: Configuration object with selection_method and visibility_thr attributes.
108
+ kpts (np.ndarray): Keypoints array of shape (K, 3).
109
+ num_visible (int): Number of keypoints above visibility threshold.
110
+ bbox (Optional[Tuple]): Optional bbox for distance methods.
111
+ method (Optional[str]): Override selection method.
112
+
113
+ Returns:
114
+ Tuple[np.ndarray, np.ndarray]: Selected keypoint coordinates (N,2) and confidences (N,).
115
+
116
+ Raises:
117
+ ValueError: If an unknown method is specified.
118
+ """
119
+ if num_visible == 0:
120
+ return kpts[:, :2], kpts[:, 2]
121
+
122
+ methods = ["confidence", "distance", "distance+confidence", "closest"]
123
+ sel_method = method or args.selection_method
124
+ if sel_method not in methods:
125
+ raise ValueError("Unknown method for keypoint selection: {}".format(sel_method))
126
+
127
+ # Keep at most one facial keypoint (the most confident of the first three)
128
+ facial_kpts = kpts[:3, :]
129
+ facial_conf = kpts[:3, 2]
130
+ facial_point = facial_kpts[np.argmax(facial_conf)]
131
+ if facial_point[-1] >= args.visibility_thr:
132
+ kpts = np.concatenate([facial_point[None, :], kpts[3:]], axis=0)
133
+
134
+ conf = kpts[:, 2]
135
+ vis_mask = conf >= args.visibility_thr
136
+ coords = kpts[vis_mask, :2]
137
+ confs = conf[vis_mask]
138
+
139
+ if sel_method == "confidence":
140
+ order = np.argsort(confs)[::-1]
141
+ coords = coords[order]
142
+ confs = confs[order]
143
+ elif sel_method == "distance":
144
+ if bbox is None:
145
+ bbox_center = np.array([coords[:, 0].mean(), coords[:, 1].mean()])
146
+ else:
147
+ bbox_center = np.array([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2])
148
+ dists = np.linalg.norm(coords[:, :2] - bbox_center, axis=1)
149
+ dist_matrix = np.linalg.norm(coords[:, None, :2] - coords[None, :, :2], axis=2)
150
+ np.fill_diagonal(dist_matrix, np.inf)
151
+ min_inter_dist = np.min(dist_matrix, axis=1)
152
+ order = np.argsort(dists + 3 * min_inter_dist)[::-1]
153
+ coords = coords[order, :2]
154
+ confs = confs[order]
155
+ elif sel_method == "distance+confidence":
156
+ order = np.argsort(confs)[::-1]
157
+ confidences = kpts[order, 2]
158
+ coords = coords[order, :2]
159
+ confs = confs[order]
160
+
161
+ dist_matrix = np.linalg.norm(coords[:, None, :2] - coords[None, :, :2], axis=2)
162
+
163
+ selected_idx = [0]
164
+ confidences[0] = -1
165
+ for _ in range(coords.shape[0] - 1):
166
+ min_dist = np.min(dist_matrix[:, selected_idx], axis=1)
167
+ min_dist[confidences < np.percentile(confidences, 80)] = -1
168
+
169
+ next_idx = np.argmax(min_dist)
170
+ selected_idx.append(next_idx)
171
+ confidences[next_idx] = -1
172
+
173
+ coords = coords[selected_idx]
174
+ confs = confs[selected_idx]
175
+ elif sel_method == "closest":
176
+ coords = coords[confs > STRICT_KPT_THRESHOLD, :]
177
+ confs = confs[confs > STRICT_KPT_THRESHOLD]
178
+ if bbox is None:
179
+ bbox_center = np.array([coords[:, 0].mean(), coords[:, 1].mean()])
180
+ else:
181
+ bbox_center = np.array([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2])
182
+ dists = np.linalg.norm(coords[:, :2] - bbox_center, axis=1)
183
+ order = np.argsort(dists)
184
+ coords = coords[order, :2]
185
+ confs = confs[order]
186
+
187
+ return coords, confs
188
+
189
+
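For intuition, a standalone sketch of the simplest selection strategy above ("confidence"): keep the keypoints above the visibility threshold and order them by descending confidence. The array values and the threshold are made up for illustration.

import numpy as np

visibility_thr = 0.3  # assumed value, mirroring args.visibility_thr
kpts = np.array([[10.0, 20.0, 0.9],
                 [15.0, 25.0, 0.2],   # below threshold -> dropped
                 [30.0, 40.0, 0.6]])

vis = kpts[:, 2] >= visibility_thr
coords, confs = kpts[vis, :2], kpts[vis, 2]
order = np.argsort(confs)[::-1]        # highest confidence first
coords, confs = coords[order], confs[order]
print(coords)   # [[10. 20.], [30. 40.]]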
190
+ def prepare_model(model_cfg: Any, model_checkpoint: str) -> SAM2ImagePredictor:
191
+ """
192
+ Build and return a SAM2ImagePredictor model on the appropriate device.
193
+
194
+ Args:
195
+ model_cfg: Configuration for SAM2 model.
196
+ model_checkpoint (str): Path to model checkpoint.
197
+
198
+ Returns:
199
+ SAM2ImagePredictor: Initialized SAM2 image predictor.
200
+ """
201
+ if torch.cuda.is_available():
202
+ device = torch.device("cuda")
203
+ elif torch.backends.mps.is_available():
204
+ device = torch.device("mps")
205
+ else:
206
+ device = torch.device("cpu")
207
+
208
+ sam2 = build_sam2(model_cfg, model_checkpoint, device=device, apply_postprocessing=True)
209
+ model = SAM2ImagePredictor(
210
+ sam2,
211
+ max_hole_area=10.0,
212
+ max_sprinkle_area=50.0,
213
+ )
214
+ return model
215
+
216
+
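A hedged end-to-end usage sketch of the helpers in this file. The SAM2 config/checkpoint paths and the image path are placeholders, the sam_args values are illustrative, and the detections would normally come from the BMP demo pipeline rather than random numbers.

from types import SimpleNamespace

import cv2
import numpy as np
from mmengine.structures import InstanceData

# Placeholder SAM2 config/checkpoint -- substitute real files from the sam2 repo.
predictor = prepare_model("configs/sam2.1/sam2.1_hiera_l.yaml", "checkpoints/sam2.1_hiera_large.pt")

# Minimal sam_args containing every field checked by _validate_sam_args (values are illustrative).
sam_args = SimpleNamespace(
    crop=False, use_bbox=True, confidence_thr=0.3, ignore_small_bboxes=False,
    num_pos_keypoints=6, num_pos_keypoints_if_crowd=4, crowd_by_max_iou=None,
    batch=False, exclusive_masks=True, extend_bbox=True,
    pose_mask_consistency=False, visibility_thr=0.3,
    selection_method="distance+confidence", num_neg_keypoints=4,
)

image = cv2.imread("demo/example.jpg")  # any BGR image; the path is a placeholder
dets = InstanceData(
    bboxes=np.array([[50.0, 60.0, 120.0, 200.0]], dtype=np.float32),   # xywh
    bbox_scores=np.array([0.9], dtype=np.float32),
    keypoints=np.concatenate(
        [np.random.uniform(60, 160, (1, 17, 2)), np.full((1, 17, 1), 0.9)], axis=-1
    ).astype(np.float32),
    pred_masks=np.zeros((1, image.shape[0], image.shape[1]), dtype=np.uint8),
)
refined = process_image_with_SAM(sam_args, image, predictor, dets)
print(refined.refined_masks.shape)   # (1, H, W) refined instance masks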
217
+ def _compute_mask_pose_consistency(masks: List[np.ndarray], keypoints_list: List[np.ndarray]) -> np.ndarray:
218
+ """
219
+ Compute mask-pose consistency score for each mask-keypoints pair.
220
+
221
+ Args:
222
+ masks (List[np.ndarray]): Binary masks list.
223
+ keypoints_list (List[np.ndarray]): List of keypoint arrays per instance.
224
+
225
+ Returns:
226
+ np.ndarray: Consistency scores array of shape (N,).
227
+ """
228
+ scores: List[float] = []
229
+ for idx, (mask, kpts) in enumerate(zip(masks, keypoints_list)):
230
+ other_kpts = np.concatenate([keypoints_list[:idx], keypoints_list[idx + 1 :]], axis=0).reshape(-1, 3)
231
+ score = _compute_one_mask_pose_consistency(mask, kpts, other_kpts)
232
+ scores.append(score)
233
+
234
+ return np.array(scores)
235
+
236
+
237
+ def _pose2seg(
238
+ args: Any,
239
+ model: SAM2ImagePredictor,
240
+ bbox_xyxy: Optional[List[float]] = None,
241
+ pos_kpts: Optional[np.ndarray] = None,
242
+ neg_kpts: Optional[np.ndarray] = None,
243
+ image: Optional[np.ndarray] = None,
244
+ gt_mask: Optional[Any] = None,
245
+ num_pos_keypoints: Optional[int] = None,
246
+ gt_mask_is_binary: bool = False,
247
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, float]:
248
+ """
249
+ Run SAM segmentation conditioned on pose keypoints and optional ground truth mask.
250
+
251
+ Args:
252
+ args: Configuration object with prompting settings.
253
+ model (SAM2ImagePredictor): Prepared SAM2 model.
254
+ bbox_xyxy (Optional[List[float]]): Bounding box coordinates in xyxy format.
255
+ pos_kpts (Optional[np.ndarray]): Positive keypoints array.
256
+ neg_kpts (Optional[np.ndarray]): Negative keypoints array.
257
+ image (Optional[np.ndarray]): Input image array.
258
+ gt_mask (Optional[Any]): Ground truth mask (optional).
259
+ num_pos_keypoints (Optional[int]): Number of positive keypoints to use.
260
+ gt_mask_is_binary (bool): Flag indicating if ground truth mask is binary.
261
+
262
+ Returns:
263
+ Tuple of (mask, pos_kpts_backup, neg_kpts_backup, score).
264
+ """
265
+ num_pos_keypoints = args.num_pos_keypoints if num_pos_keypoints is None else num_pos_keypoints
266
+
267
+ # Filter out unannotated and invisible keypoints
268
+ if pos_kpts is not None:
269
+ pos_kpts = pos_kpts.reshape(-1, 3)
270
+ valid_kpts = pos_kpts[:, 2] > args.visibility_thr
271
+
272
+ pose_bbox = np.array([pos_kpts[:, 0].min(), pos_kpts[:, 1].min(), pos_kpts[:, 0].max(), pos_kpts[:, 1].max()])
273
+ pos_kpts, conf = _select_keypoints(args, pos_kpts, num_visible=valid_kpts.sum(), bbox=bbox_xyxy)
274
+
275
+ pos_kpts_backup = np.concatenate([pos_kpts, conf[:, None]], axis=1)
276
+
277
+ if pos_kpts.shape[0] > num_pos_keypoints:
278
+ pos_kpts = pos_kpts[:num_pos_keypoints, :]
279
+ pos_kpts_backup = pos_kpts_backup[:num_pos_keypoints, :]
280
+
281
+ else:
282
+ pose_bbox = None
283
+ pos_kpts = np.empty((0, 2), dtype=np.float32)
284
+ pos_kpts_backup = np.empty((0, 3), dtype=np.float32)
285
+
286
+ if neg_kpts is not None:
287
+ neg_kpts = neg_kpts.reshape(-1, 3)
288
+ valid_kpts = neg_kpts[:, 2] > args.visibility_thr
289
+
290
+ neg_kpts, conf = _select_keypoints(
291
+ args, neg_kpts, num_visible=valid_kpts.sum(), bbox=bbox_xyxy, method="closest"
292
+ )
293
+ selected_neg_kpts = neg_kpts
294
+ neg_kpts_backup = np.concatenate([neg_kpts, conf[:, None]], axis=1)
295
+
296
+ if neg_kpts.shape[0] > args.num_neg_keypoints:
297
+ selected_neg_kpts = neg_kpts[: args.num_neg_keypoints, :]
298
+
299
+ else:
300
+ selected_neg_kpts = np.empty((0, 2), dtype=np.float32)
301
+ neg_kpts_backup = np.empty((0, 3), dtype=np.float32)
302
+
303
+ # Concatenate positive and negative keypoints
304
+ kpts = np.concatenate([pos_kpts, selected_neg_kpts], axis=0)
305
+ kpts_labels = np.concatenate([np.ones(pos_kpts.shape[0]), np.zeros(selected_neg_kpts.shape[0])], axis=0)
306
+
307
+ bbox = bbox_xyxy if args.use_bbox else None
308
+
309
+ if args.extend_bbox and bbox is not None:
310
+ # Expand the bbox such that it contains all positive keypoints
311
+ pose_bbox = np.array(
312
+ [pos_kpts[:, 0].min() - 2, pos_kpts[:, 1].min() - 2, pos_kpts[:, 0].max() + 2, pos_kpts[:, 1].max() + 2]
313
+ )
314
+ expanded_bbox = np.array(bbox)
315
+ expanded_bbox[:2] = np.minimum(bbox[:2], pose_bbox[:2])
316
+ expanded_bbox[2:] = np.maximum(bbox[2:], pose_bbox[2:])
317
+ bbox = expanded_bbox
318
+
319
+ if args.crop and args.use_bbox and image is not None:
320
+ # Crop the image to the 1.5 * bbox size
321
+ crop_bbox = np.array(bbox)
322
+ bbox_center = np.array([(crop_bbox[0] + crop_bbox[2]) / 2, (crop_bbox[1] + crop_bbox[3]) / 2])
323
+ bbox_size = np.array([crop_bbox[2] - crop_bbox[0], crop_bbox[3] - crop_bbox[1]])
324
+ bbox_size = 1.5 * bbox_size
325
+ crop_bbox = np.array(
326
+ [
327
+ bbox_center[0] - bbox_size[0] / 2,
328
+ bbox_center[1] - bbox_size[1] / 2,
329
+ bbox_center[0] + bbox_size[0] / 2,
330
+ bbox_center[1] + bbox_size[1] / 2,
331
+ ]
332
+ )
333
+ crop_bbox = np.round(crop_bbox).astype(int)
334
+ crop_bbox = np.clip(crop_bbox, 0, [image.shape[1], image.shape[0], image.shape[1], image.shape[0]])
335
+ original_image_size = image.shape[:2]
336
+ image = image[crop_bbox[1] : crop_bbox[3], crop_bbox[0] : crop_bbox[2], :]
337
+
338
+ # Update the keypoints
339
+ kpts = kpts - crop_bbox[:2]
340
+ bbox[:2] = bbox[:2] - crop_bbox[:2]
341
+ bbox[2:] = bbox[2:] - crop_bbox[:2]
342
+
343
+ model.set_image(image)
344
+
345
+ masks, scores, logits = model.predict(
346
+ point_coords=kpts,
347
+ point_labels=kpts_labels,
348
+ box=bbox,
349
+ multimask_output=False,
350
+ )
351
+ mask = masks[0]
352
+ scores = scores[0]
353
+
354
+ if args.crop and args.use_bbox and image is not None:
355
+ # Pad the mask to the original image size
356
+ mask_padded = np.zeros(original_image_size, dtype=np.uint8)
357
+ mask_padded[crop_bbox[1] : crop_bbox[3], crop_bbox[0] : crop_bbox[2]] = mask
358
+ mask = mask_padded
359
+
360
+ bbox[:2] = bbox[:2] + crop_bbox[:2]
361
+ bbox[2:] = bbox[2:] + crop_bbox[:2]
362
+
363
+ if args.pose_mask_consistency:
364
+ if gt_mask_is_binary:
365
+ gt_mask_binary = gt_mask
366
+ else:
367
+ gt_mask_binary = Mask.decode(gt_mask).astype(bool) if gt_mask is not None else None
368
+
369
+ gt_mask_pose_consistency = _compute_one_mask_pose_consistency(gt_mask_binary, pos_kpts_backup, neg_kpts_backup)
370
+ dt_mask_pose_consistency = _compute_one_mask_pose_consistency(mask, pos_kpts_backup, neg_kpts_backup)
371
+
372
+ tol = 0.1
373
+ dt_is_same = np.abs(dt_mask_pose_consistency - gt_mask_pose_consistency) < tol
374
+ if dt_is_same:
375
+ mask = gt_mask_binary if gt_mask_binary.sum() < mask.sum() else mask
376
+ else:
377
+ mask = gt_mask_binary if gt_mask_pose_consistency > dt_mask_pose_consistency else mask
378
+
379
+ return mask, pos_kpts_backup, neg_kpts_backup, scores
380
+
381
+
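For reference, a standalone sketch of the prompt layout that _pose2seg assembles for SAM2: the instance's own keypoints become label-1 point prompts, other people's keypoints become label-0 points, and the detection box is passed separately. The coordinates below are made up; the predict call is left commented because it needs the predictor built by prepare_model.

import numpy as np

pos = np.array([[100.0, 120.0], [110.0, 180.0]])   # this person's keypoints
neg = np.array([[300.0, 140.0]])                   # a neighbour's keypoint
point_coords = np.concatenate([pos, neg], axis=0)
point_labels = np.concatenate([np.ones(len(pos)), np.zeros(len(neg))])
box = np.array([80.0, 100.0, 160.0, 220.0])        # xyxy detection box

# masks, scores, logits = predictor.predict(
#     point_coords=point_coords, point_labels=point_labels,
#     box=box, multimask_output=False)
print(point_coords.shape, point_labels)   # (3, 2) [1. 1. 0.]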
382
+ def process_image_with_SAM(
383
+ sam_args: Any,
384
+ image: np.ndarray,
385
+ model: SAM2ImagePredictor,
386
+ new_dets: InstanceData,
387
+ old_dets: Optional[InstanceData] = None,
388
+ ) -> InstanceData:
389
+ """
390
+ Wrapper that validates args and routes to single or batch processing.
391
+ """
392
+ _validate_sam_args(sam_args)
393
+ if sam_args.batch:
394
+ return _process_image_batch(sam_args, image, model, new_dets, old_dets)
395
+ return _process_image_single(sam_args, image, model, new_dets, old_dets)
396
+
397
+
398
+ def _process_image_single(
399
+ sam_args: Any,
400
+ image: np.ndarray,
401
+ model: SAM2ImagePredictor,
402
+ new_dets: InstanceData,
403
+ old_dets: Optional[InstanceData] = None,
404
+ ) -> InstanceData:
405
+ """
406
+ Refine instance segmentation masks using SAM2 with pose-conditioned prompts.
407
+
408
+ Args:
409
+ sam_args (Any): DotDict containing required SAM parameters:
410
+ crop (bool), use_bbox (bool), confidence_thr (float),
411
+ ignore_small_bboxes (bool), num_pos_keypoints (int),
412
+ num_pos_keypoints_if_crowd (int), crowd_by_max_iou (Optional[float]),
413
+ batch (bool), exclusive_masks (bool), extend_bbox (bool), pose_mask_consistency (bool).
414
+ image (np.ndarray): BGR image array of shape (H, W, 3).
415
+ model (SAM2ImagePredictor): Initialized SAM2 predictor.
416
+ new_dets (InstanceData): New detections with attributes:
417
+ bboxes, pred_masks, keypoints, bbox_scores.
418
+ old_dets (Optional[InstanceData]): Previous detections for negative prompts.
419
+
420
+ Returns:
421
+ InstanceData: `new_dets` updated in-place with
422
+ `.refined_masks`, `.sam_scores`, and `.sam_kpts`.
423
+ """
424
+ _validate_sam_args(sam_args)
425
+
426
+ if not (sam_args.crop and sam_args.use_bbox):
427
+ model.set_image(image)
428
+
429
+ # Ignore all keypoints with confidence below the threshold
430
+ new_keypoints = new_dets.keypoints.copy()
431
+ for kpts in new_keypoints:
432
+ conf_mask = kpts[:, 2] < sam_args.confidence_thr
433
+ kpts[conf_mask, :] = 0
434
+ n_new_dets = len(new_dets.bboxes)
435
+ n_old_dets = 0
436
+ if old_dets is not None:
437
+ n_old_dets = len(old_dets.bboxes)
438
+ old_keypoints = old_dets.keypoints.copy()
439
+ for kpts in old_keypoints:
440
+ conf_mask = kpts[:, 2] < sam_args.confidence_thr
441
+ kpts[conf_mask, :] = 0
442
+
443
+ all_bboxes = new_dets.bboxes.copy()
444
+ if old_dets is not None:
445
+ all_bboxes = np.concatenate([all_bboxes, old_dets.bboxes], axis=0)
446
+
447
+ max_ious = _get_max_ious(all_bboxes)
448
+
449
+ gt_bboxes = []
450
+ new_dets.refined_masks = np.zeros((n_new_dets, image.shape[0], image.shape[1]), dtype=np.uint8)
451
+ new_dets.sam_scores = np.zeros_like(new_dets.bbox_scores)
452
+ new_dets.sam_kpts = np.zeros((len(new_dets.bboxes), sam_args.num_pos_keypoints, 3), dtype=np.float32)
453
+ for instance_idx in range(len(new_dets.bboxes)):
454
+ bbox_xywh = new_dets.bboxes[instance_idx]
455
+ bbox_area = bbox_xywh[2] * bbox_xywh[3]
456
+
457
+ if sam_args.ignore_small_bboxes and bbox_area < 100 * 100:
458
+ continue
459
+ dt_mask = new_dets.pred_masks[instance_idx] if new_dets.pred_masks is not None else None
460
+
461
+ bbox_xyxy = [bbox_xywh[0], bbox_xywh[1], bbox_xywh[0] + bbox_xywh[2], bbox_xywh[1] + bbox_xywh[3]]
462
+ gt_bboxes.append(bbox_xyxy)
463
+ this_kpts = new_keypoints[instance_idx].reshape(1, -1, 3)
464
+ other_kpts = None
465
+ if old_dets is not None:
466
+ other_kpts = old_keypoints.copy().reshape(n_old_dets, -1, 3)
467
+ if len(new_keypoints) > 1:
468
+ other_new_kpts = np.concatenate([new_keypoints[:instance_idx], new_keypoints[instance_idx + 1 :]], axis=0)
469
+ other_kpts = (
470
+ np.concatenate([other_kpts, other_new_kpts], axis=0) if other_kpts is not None else other_new_kpts
471
+ )
472
+
473
+ num_pos_keypoints = sam_args.num_pos_keypoints
474
+ if sam_args.crowd_by_max_iou is not None and max_ious[instance_idx] > sam_args.crowd_by_max_iou:
475
+ bbox_xyxy = None
476
+ num_pos_keypoints = sam_args.num_pos_keypoints_if_crowd
477
+
478
+ dt_mask, pos_kpts, neg_kpts, scores = _pose2seg(
479
+ sam_args,
480
+ model,
481
+ bbox_xyxy,
482
+ pos_kpts=this_kpts,
483
+ neg_kpts=other_kpts,
484
+ image=image if (sam_args.crop and sam_args.use_bbox) else None,
485
+ gt_mask=dt_mask,
486
+ num_pos_keypoints=num_pos_keypoints,
487
+ gt_mask_is_binary=True,
488
+ )
489
+
490
+ new_dets.refined_masks[instance_idx] = dt_mask
491
+ new_dets.sam_scores[instance_idx] = scores
492
+
493
+ # If the number of positive keypoints is less than the required number, fill the rest with zeros
494
+ if len(pos_kpts) != sam_args.num_pos_keypoints:
495
+ pos_kpts = np.concatenate(
496
+ [pos_kpts, np.zeros((sam_args.num_pos_keypoints - len(pos_kpts), 3), dtype=np.float32)], axis=0
497
+ )
498
+ new_dets.sam_kpts[instance_idx] = pos_kpts
499
+
500
+ n_masks = len(new_dets.refined_masks) + (len(old_dets.refined_masks) if old_dets is not None else 0)
501
+
502
+ if sam_args.exclusive_masks and n_masks > 1:
503
+ all_masks = (
504
+ np.concatenate([new_dets.refined_masks, old_dets.refined_masks], axis=0)
505
+ if old_dets is not None
506
+ else new_dets.refined_masks
507
+ )
508
+ all_scores = (
509
+ np.concatenate([new_dets.sam_scores, old_dets.sam_scores], axis=0)
510
+ if old_dets is not None
511
+ else new_dets.sam_scores
512
+ )
513
+ refined_masks = _apply_exclusive_masks(all_masks, all_scores)
514
+ new_dets.refined_masks = refined_masks[: len(new_dets.refined_masks)]
515
+
516
+ return new_dets
517
+
518
+
519
+ def _process_image_batch(
520
+ sam_args: Any,
521
+ image: np.ndarray,
522
+ model: SAM2ImagePredictor,
523
+ new_dets: InstanceData,
524
+ old_dets: Optional[InstanceData] = None,
525
+ ) -> InstanceData:
526
+ """
527
+ Batch process multiple detection instances with SAM2 refinement.
528
+
529
+ Args:
530
+ sam_args (Any): DotDict of SAM parameters (same as `process_image_with_SAM`).
531
+ image (np.ndarray): Input BGR image.
532
+ model (SAM2ImagePredictor): Prepared SAM2 predictor.
533
+ new_dets (InstanceData): New detection instances.
534
+ old_dets (Optional[InstanceData]): Previous detections for negative prompts.
535
+
536
+ Returns:
537
+ InstanceData: `new_dets` updated as in `process_image_with_SAM`.
538
+ """
539
+ n_new_dets = len(new_dets.bboxes)
540
+
541
+ model.set_image(image)
542
+
543
+ image_kpts = []
544
+ image_bboxes = []
545
+ num_valid_kpts = []
546
+ for instance_idx in range(len(new_dets.bboxes)):
547
+
548
+ bbox_xywh = new_dets.bboxes[instance_idx].copy()
549
+ bbox_area = bbox_xywh[2] * bbox_xywh[3]
550
+ if sam_args.ignore_small_bboxes and bbox_area < 100 * 100:
551
+ continue
552
+
553
+ this_kpts = new_dets.keypoints[instance_idx].copy().reshape(-1, 3)
554
+ kpts_vis = np.array(this_kpts[:, 2])
555
+ visible_kpts = (kpts_vis > sam_args.visibility_thr) & (this_kpts[:, 2] > sam_args.confidence_thr)
556
+ num_visible = (visible_kpts).sum()
557
+ if num_visible <= 0:
558
+ continue
559
+ num_valid_kpts.append(num_visible)
560
+ image_bboxes.append(np.array(bbox_xywh))
561
+ this_kpts[~visible_kpts, :2] = 0
562
+ this_kpts[:, 2] = visible_kpts
563
+ image_kpts.append(this_kpts)
564
+ if old_dets is not None:
565
+ for instance_idx in range(len(old_dets.bboxes)):
566
+ bbox_xywh = old_dets.bboxes[instance_idx].copy()
567
+ bbox_area = bbox_xywh[2] * bbox_xywh[3]
568
+ if sam_args.ignore_small_bboxes and bbox_area < 100 * 100:
569
+ continue
570
+ this_kpts = old_dets.keypoints[instance_idx].reshape(-1, 3)
571
+ kpts_vis = np.array(this_kpts[:, 2])
572
+ visible_kpts = (kpts_vis > sam_args.visibility_thr) & (this_kpts[:, 2] > sam_args.confidence_thr)
573
+ num_visible = (visible_kpts).sum()
574
+ if num_visible <= 0:
575
+ continue
576
+ num_valid_kpts.append(num_visible)
577
+ image_bboxes.append(np.array(bbox_xywh))
578
+ this_kpts[~visible_kpts, :2] = 0
579
+ this_kpts[:, 2] = visible_kpts
580
+ image_kpts.append(this_kpts)
581
+
582
+ image_kpts = np.array(image_kpts)
583
+ image_bboxes = np.array(image_bboxes)
584
+ num_valid_kpts = np.array(num_valid_kpts)
585
+
586
+ image_kpts_backup = image_kpts.copy()
587
+
588
+ # Prepare keypoints such that all instances have the same number of keypoints
589
+ # First sort keypoints by their distance to the center of the bounding box
590
+ # If some are missing, duplicate the last one
591
+ prepared_kpts = []
592
+ prepared_kpts_backup = []
593
+ for bbox, kpts, num_visible in zip(image_bboxes, image_kpts, num_valid_kpts):
594
+
595
+ this_kpts, this_conf = _select_keypoints(sam_args, kpts, num_visible, bbox)
596
+
597
+ # Duplicate the last keypoint if some are missing
598
+ if this_kpts.shape[0] < num_valid_kpts.max():
599
+ this_kpts = np.concatenate(
600
+ [this_kpts, np.tile(this_kpts[-1], (num_valid_kpts.max() - this_kpts.shape[0], 1))], axis=0
601
+ )
602
+ this_conf = np.concatenate(
603
+ [this_conf, np.tile(this_conf[-1], (num_valid_kpts.max() - this_conf.shape[0],))], axis=0
604
+ )
605
+
606
+ prepared_kpts.append(this_kpts)
607
+ prepared_kpts_backup.append(np.concatenate([this_kpts, this_conf[:, None]], axis=1))
608
+ image_kpts = np.array(prepared_kpts)
609
+ image_kpts_backup = np.array(prepared_kpts_backup)
610
+ kpts_labels = np.ones(image_kpts.shape[:2])
611
+
612
+ # Compute IoUs between all bounding boxes
613
+ max_ious = _get_max_ious(image_bboxes)
614
+ num_pos_keypoints = sam_args.num_pos_keypoints
615
+ use_bbox = sam_args.use_bbox
616
+ if sam_args.crowd_by_max_iou is not None and max_ious[instance_idx] > sam_args.crowd_by_max_iou:
617
+ use_bbox = False
618
+ num_pos_keypoints = sam_args.num_pos_keypoints_if_crowd
619
+
620
+ # Threshold the number of positive keypoints
621
+ if num_pos_keypoints > 0 and num_pos_keypoints < image_kpts.shape[1]:
622
+ image_kpts = image_kpts[:, :num_pos_keypoints, :]
623
+ kpts_labels = kpts_labels[:, :num_pos_keypoints]
624
+ image_kpts_backup = image_kpts_backup[:, :num_pos_keypoints, :]
625
+
626
+ elif num_pos_keypoints == 0:
627
+ image_kpts = None
628
+ kpts_labels = None
629
+ image_kpts_backup = np.empty((0, 3), dtype=np.float32)
630
+
631
+ image_bboxes_xyxy = None
632
+ if use_bbox:
633
+ image_bboxes_xyxy = np.array(image_bboxes)
634
+ image_bboxes_xyxy[:, 2:] += image_bboxes_xyxy[:, :2]
635
+
636
+ # Expand the bbox to include the positive keypoints
637
+ if sam_args.extend_bbox:
638
+ pose_bbox = np.stack(
639
+ [
640
+ np.min(image_kpts[:, :, 0], axis=1) - 2,
641
+ np.min(image_kpts[:, :, 1], axis=1) - 2,
642
+ np.max(image_kpts[:, :, 0], axis=1) + 2,
643
+ np.max(image_kpts[:, :, 1], axis=1) + 2,
644
+ ],
645
+ axis=1,
646
+ )
647
+ expanded_bbox = np.array(image_bboxes_xyxy)
648
+ expanded_bbox[:, :2] = np.minimum(expanded_bbox[:, :2], pose_bbox[:, :2])
649
+ expanded_bbox[:, 2:] = np.maximum(expanded_bbox[:, 2:], pose_bbox[:, 2:])
650
+ # bbox_expanded = (np.abs(expanded_bbox - image_bboxes_xyxy) > 1e-4).any(axis=1)
651
+ image_bboxes_xyxy = expanded_bbox
652
+
653
+ # Process even old detections to get their 'negative' keypoints
654
+ masks, scores, logits = model.predict(
655
+ point_coords=image_kpts,
656
+ point_labels=kpts_labels,
657
+ box=image_bboxes_xyxy,
658
+ multimask_output=False,
659
+ )
660
+
661
+ # Reshape the masks to (N, C, H, W). If the model outputs (C, H, W), add a leading instance dimension
662
+ if len(masks.shape) == 3:
663
+ masks = masks[None, :, :, :]
664
+ masks = masks[:, 0, :, :]
665
+ N = masks.shape[0]
666
+ scores = scores.reshape(N)
667
+
668
+ if sam_args.exclusive_masks and N > 1:
669
+ # Make sure the masks are non-overlapping
670
+ # If two masks overlap, set the pixel to the one with the highest score
671
+ masks = _apply_exclusive_masks(masks, scores)
672
+
673
+ gt_masks = new_dets.pred_masks.copy() if new_dets.pred_masks is not None else None
674
+ if sam_args.pose_mask_consistency and gt_masks is not None:
675
+ # Measure mask-pose consistency by computing the number of keypoints inside the mask
676
+ # Compute for both gt (if available) and predicted masks and then choose the one with higher consistency
677
+ dt_mask_pose_consistency = _compute_mask_pose_consistency(masks, image_kpts_backup)
678
+ gt_mask_pose_consistency = _compute_mask_pose_consistency(gt_masks, image_kpts_backup)
679
+
680
+ dt_masks_area = np.array([m.sum() for m in masks])
681
+ gt_masks_area = np.array([m.sum() for m in gt_masks]) if gt_masks is not None else np.zeros_like(dt_masks_area)
682
+
683
+ # If PM-c is approx the same, prefer the smaller mask
684
+ tol = 0.1
685
+ pmc_is_equal = np.isclose(dt_mask_pose_consistency, gt_mask_pose_consistency, atol=tol)
686
+ dt_is_worse = (dt_mask_pose_consistency < (gt_mask_pose_consistency - tol)) | pmc_is_equal & (
687
+ dt_masks_area > gt_masks_area
688
+ )
689
+
690
+ new_masks = []
691
+ for dt_mask, gt_mask, dt_worse in zip(masks, gt_masks, dt_is_worse):
692
+ if dt_worse:
693
+ new_masks.append(gt_mask)
694
+ else:
695
+ new_masks.append(dt_mask)
696
+ masks = np.array(new_masks)
697
+
698
+ new_dets.refined_masks = masks[:n_new_dets]
699
+ new_dets.sam_scores = scores[:n_new_dets]
700
+ new_dets.sam_kpts = image_kpts_backup[:n_new_dets]
701
+
702
+ return new_dets
703
+
704
+
705
+ def _apply_exclusive_masks(masks: np.ndarray, scores: np.ndarray) -> np.ndarray:
706
+ """
707
+ Ensure masks are non-overlapping by keeping at each pixel the mask with the highest score.
708
+ """
709
+ no_mask = masks.sum(axis=0) == 0
710
+ masked_scores = masks * scores[:, None, None]
711
+ argmax_masks = np.argmax(masked_scores, axis=0)
712
+ new_masks = argmax_masks[None, :, :] == (np.arange(masks.shape[0])[:, None, None])
713
+ new_masks[:, no_mask] = 0
714
+ return new_masks
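A toy check of the exclusive-mask rule above: where two masks overlap, the shared pixel is assigned to the mask with the higher SAM score. The values below are made up.

import numpy as np

masks = np.array([[[1, 1, 0]],
                  [[0, 1, 1]]], dtype=np.uint8)
scores = np.array([0.4, 0.9])
print(_apply_exclusive_masks(masks, scores).astype(int))
# [[[1 0 0]]
#  [[0 1 1]]]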
mmpose/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import mmcv
3
+ import mmengine
4
+ from mmengine.utils import digit_version
5
+
6
+ from .version import __version__, short_version
7
+
8
+ mmcv_minimum_version = '2.0.0rc4'
9
+ mmcv_maximum_version = '2.3.0'
10
+ mmcv_version = digit_version(mmcv.__version__)
11
+
12
+ mmengine_minimum_version = '0.6.0'
13
+ mmengine_maximum_version = '1.0.0'
14
+ mmengine_version = digit_version(mmengine.__version__)
15
+
16
+ assert (mmcv_version >= digit_version(mmcv_minimum_version)
17
+ and mmcv_version <= digit_version(mmcv_maximum_version)), \
18
+ f'MMCV=={mmcv.__version__} is used but incompatible. ' \
19
+ f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.'
20
+
21
+ assert (mmengine_version >= digit_version(mmengine_minimum_version)
22
+ and mmengine_version <= digit_version(mmengine_maximum_version)), \
23
+ f'MMEngine=={mmengine.__version__} is used but incompatible. ' \
24
+ f'Please install mmengine>={mmengine_minimum_version}, ' \
25
+ f'<={mmengine_maximum_version}.'
26
+
27
+ __all__ = ['__version__', 'short_version']
mmpose/apis/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from .inference import (collect_multi_frames, inference_bottomup,
3
+ inference_topdown, init_model)
4
+ from .inference_3d import (collate_pose_sequence, convert_keypoint_definition,
5
+ extract_pose_sequence, inference_pose_lifter_model)
6
+ from .inference_tracking import _compute_iou, _track_by_iou, _track_by_oks
7
+ from .inferencers import MMPoseInferencer, Pose2DInferencer
8
+ from .visualization import visualize
9
+
10
+ __all__ = [
11
+ 'init_model', 'inference_topdown', 'inference_bottomup',
12
+ 'collect_multi_frames', 'Pose2DInferencer', 'MMPoseInferencer',
13
+ '_track_by_iou', '_track_by_oks', '_compute_iou',
14
+ 'inference_pose_lifter_model', 'extract_pose_sequence',
15
+ 'convert_keypoint_definition', 'collate_pose_sequence', 'visualize'
16
+ ]
mmpose/apis/inference.py ADDED
@@ -0,0 +1,280 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import warnings
3
+ from pathlib import Path
4
+ from typing import List, Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ from mmengine.config import Config
10
+ from mmengine.dataset import Compose, pseudo_collate
11
+ from mmengine.model.utils import revert_sync_batchnorm
12
+ from mmengine.registry import init_default_scope
13
+ from mmengine.runner import load_checkpoint
14
+ from PIL import Image
15
+
16
+ from mmpose.datasets.datasets.utils import parse_pose_metainfo
17
+ from mmpose.models.builder import build_pose_estimator
18
+ from mmpose.structures import PoseDataSample
19
+ from mmpose.structures.bbox import bbox_xywh2xyxy
20
+
21
+ import cv2
22
+
23
+ def dataset_meta_from_config(config: Config,
24
+ dataset_mode: str = 'train') -> Optional[dict]:
25
+ """Get dataset metainfo from the model config.
26
+
27
+ Args:
28
+ config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path,
29
+ :obj:`Path`, or the config object.
30
+ dataset_mode (str): Specify the dataset of which to get the metainfo.
31
+ Options are ``'train'``, ``'val'`` and ``'test'``. Defaults to
32
+ ``'train'``
33
+
34
+ Returns:
35
+ dict, optional: The dataset metainfo. See
36
+ ``mmpose.datasets.datasets.utils.parse_pose_metainfo`` for details.
37
+ Return ``None`` if failing to get dataset metainfo from the config.
38
+ """
39
+ try:
40
+ if dataset_mode == 'train':
41
+ dataset_cfg = config.train_dataloader.dataset
42
+ elif dataset_mode == 'val':
43
+ dataset_cfg = config.val_dataloader.dataset
44
+ elif dataset_mode == 'test':
45
+ dataset_cfg = config.test_dataloader.dataset
46
+ else:
47
+ raise ValueError(
48
+ f'Invalid dataset {dataset_mode} to get metainfo. '
49
+ 'Should be one of "train", "val", or "test".')
50
+
51
+ if 'metainfo' in dataset_cfg:
52
+ metainfo = dataset_cfg.metainfo
53
+ else:
54
+ import mmpose.datasets.datasets # noqa: F401, F403
55
+ from mmpose.registry import DATASETS
56
+
57
+ dataset_class = dataset_cfg.type if isinstance(
58
+ dataset_cfg.type, type) else DATASETS.get(dataset_cfg.type)
59
+ metainfo = dataset_class.METAINFO
60
+
61
+ metainfo = parse_pose_metainfo(metainfo)
62
+
63
+ except AttributeError:
64
+ metainfo = None
65
+
66
+ return metainfo
67
+
68
+
69
+ def init_model(config: Union[str, Path, Config],
70
+ checkpoint: Optional[str] = None,
71
+ device: str = 'cuda:0',
72
+ cfg_options: Optional[dict] = None) -> nn.Module:
73
+ """Initialize a pose estimator from a config file.
74
+
75
+ Args:
76
+ config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path,
77
+ :obj:`Path`, or the config object.
78
+ checkpoint (str, optional): Checkpoint path. If left as None, the model
79
+ will not load any weights. Defaults to ``None``
80
+ device (str): The device where the anchors will be put on.
81
+ Defaults to ``'cuda:0'``.
82
+ cfg_options (dict, optional): Options to override some settings in
83
+ the used config. Defaults to ``None``
84
+
85
+ Returns:
86
+ nn.Module: The constructed pose estimator.
87
+ """
88
+
89
+ if isinstance(config, (str, Path)):
90
+ config = Config.fromfile(config)
91
+ elif not isinstance(config, Config):
92
+ raise TypeError('config must be a filename or Config object, '
93
+ f'but got {type(config)}')
94
+ if cfg_options is not None:
95
+ config.merge_from_dict(cfg_options)
96
+ elif 'init_cfg' in config.model.backbone:
97
+ config.model.backbone.init_cfg = None
98
+ config.model.train_cfg = None
99
+
100
+ # register all modules in mmpose into the registries
101
+ scope = config.get('default_scope', 'mmpose')
102
+ if scope is not None:
103
+ init_default_scope(scope)
104
+
105
+ model = build_pose_estimator(config.model)
106
+ model = revert_sync_batchnorm(model)
107
+ # get dataset_meta in this priority: checkpoint > config > default (COCO)
108
+ dataset_meta = None
109
+
110
+ if checkpoint is not None:
111
+ ckpt = load_checkpoint(model, checkpoint, map_location='cpu')
112
+
113
+ if 'dataset_meta' in ckpt.get('meta', {}):
114
+ # checkpoint from mmpose 1.x
115
+ dataset_meta = ckpt['meta']['dataset_meta']
116
+
117
+ if dataset_meta is None:
118
+ dataset_meta = dataset_meta_from_config(config, dataset_mode='train')
119
+
120
+ if dataset_meta is None:
121
+ warnings.simplefilter('once')
122
+ warnings.warn('Can not load dataset_meta from the checkpoint or the '
123
+ 'model config. Use COCO metainfo by default.')
124
+ dataset_meta = parse_pose_metainfo(
125
+ dict(from_file='configs/_base_/datasets/coco.py'))
126
+
127
+ model.dataset_meta = dataset_meta
128
+
129
+ model.cfg = config # save the config in the model for convenience
130
+ model.to(device)
131
+ model.eval()
132
+ return model
133
+
134
+
135
+ def inference_topdown(model: nn.Module,
136
+ img: Union[np.ndarray, str],
137
+ bboxes: Optional[Union[List, np.ndarray]] = None,
138
+ masks: Optional[Union[List, np.ndarray]] = None,
139
+ bbox_format: str = 'xyxy') -> List[PoseDataSample]:
140
+ """Inference image with a top-down pose estimator.
141
+
142
+ Args:
143
+ model (nn.Module): The top-down pose estimator
144
+ img (np.ndarray | str): The loaded image or image file to inference
145
+ bboxes (np.ndarray, optional): The bboxes in shape (N, 4), each row
146
+ represents a bbox. If not given, the entire image will be regarded
147
+ as a single bbox area. Defaults to ``None``
148
+ masks (np.ndarray | list, optional): Instance masks aligned with
+ ``bboxes``; they are converted to polygon format before being
+ packed into the pipeline inputs. Defaults to ``None``
+ bbox_format (str): The bbox format indicator. Options are ``'xywh'``
149
+ and ``'xyxy'``. Defaults to ``'xyxy'``
150
+
151
+ Returns:
152
+ List[:obj:`PoseDataSample`]: The inference results. Specifically, the
153
+ predicted keypoints and scores are saved at
154
+ ``data_sample.pred_instances.keypoints`` and
155
+ ``data_sample.pred_instances.keypoint_scores``.
156
+ """
157
+ scope = model.cfg.get('default_scope', 'mmpose')
158
+ if scope is not None:
159
+ init_default_scope(scope)
160
+ pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline)
161
+
162
+ if bboxes is None or len(bboxes) == 0:
163
+ # get bbox from the image size
164
+ if isinstance(img, str):
165
+ w, h = Image.open(img).size
166
+ else:
167
+ h, w = img.shape[:2]
168
+
169
+ bboxes = np.array([[0, 0, w, h]], dtype=np.float32)
170
+ else:
171
+ if isinstance(bboxes, list):
172
+ bboxes = np.array(bboxes)
173
+
174
+ assert bbox_format in {'xyxy', 'xywh'}, \
175
+ f'Invalid bbox_format "{bbox_format}".'
176
+
177
+ if bbox_format == 'xywh':
178
+ bboxes = bbox_xywh2xyxy(bboxes)
179
+
180
+ if masks is None or len(masks) == 0:
181
+ masks = np.zeros((bboxes.shape[0], img.shape[0], img.shape[1]),
182
+ dtype=np.uint8)
183
+
184
+ # Masks are expected in polygon format
185
+ poly_masks = []
186
+ for mask in masks:
187
+ if np.sum(mask) == 0:
188
+ poly_masks.append(None)
189
+ else:
190
+ contours, _ = cv2.findContours((mask*255).astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
191
+ polygons = [contour.flatten() for contour in contours if len(contour) > 3]
192
+ poly_masks.append(polygons if polygons else None)
193
+
194
+ # construct batch data samples
195
+ data_list = []
196
+ for bbox, pmask in zip(bboxes, poly_masks):
197
+ if isinstance(img, str):
198
+ data_info = dict(img_path=img)
199
+ else:
200
+ data_info = dict(img=img)
201
+ data_info['bbox'] = bbox[None] # shape (1, 4)
202
+ data_info['segmentation'] = pmask
203
+ data_info['bbox_score'] = np.ones(1, dtype=np.float32) # shape (1,)
204
+ data_info.update(model.dataset_meta)
205
+ data_list.append(pipeline(data_info))
206
+
207
+ if data_list:
208
+ # collate data list into a batch, which is a dict with following keys:
209
+ # batch['inputs']: a list of input images
210
+ # batch['data_samples']: a list of :obj:`PoseDataSample`
211
+ batch = pseudo_collate(data_list)
212
+ with torch.no_grad():
213
+ results = model.test_step(batch)
214
+ else:
215
+ results = []
216
+
217
+ return results
218
+
219
+
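A hedged usage sketch combining init_model and the extended inference_topdown above. The config/checkpoint paths are placeholders; a loaded image array (not a file path) is passed because, when masks is omitted, the function derives an empty per-bbox mask from the image shape.

import numpy as np

# Hypothetical config/checkpoint paths -- replace with a real mmpose model.
model = init_model("configs/my_pose_config.py", "checkpoints/my_pose.pth", device="cpu")

img = np.zeros((480, 640, 3), dtype=np.uint8)               # BGR image array
bboxes = np.array([[50, 40, 200, 300]], dtype=np.float32)   # one person, xyxy
results = inference_topdown(model, img, bboxes, bbox_format="xyxy")
keypoints = results[0].pred_instances.keypoints        # typically (1, K, 2)
scores = results[0].pred_instances.keypoint_scores     # typically (1, K)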
220
+ def inference_bottomup(model: nn.Module, img: Union[np.ndarray, str]):
221
+ """Inference image with a bottom-up pose estimator.
222
+
223
+ Args:
224
+ model (nn.Module): The bottom-up pose estimator
225
+ img (np.ndarray | str): The loaded image or image file to inference
226
+
227
+ Returns:
228
+ List[:obj:`PoseDataSample`]: The inference results. Specifically, the
229
+ predicted keypoints and scores are saved at
230
+ ``data_sample.pred_instances.keypoints`` and
231
+ ``data_sample.pred_instances.keypoint_scores``.
232
+ """
233
+ pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline)
234
+
235
+ # prepare data batch
236
+ if isinstance(img, str):
237
+ data_info = dict(img_path=img)
238
+ else:
239
+ data_info = dict(img=img)
240
+ data_info.update(model.dataset_meta)
241
+ data = pipeline(data_info)
242
+ batch = pseudo_collate([data])
243
+
244
+ with torch.no_grad():
245
+ results = model.test_step(batch)
246
+
247
+ return results
248
+
249
+
250
+ def collect_multi_frames(video, frame_id, indices, online=False):
251
+ """Collect multi frames from the video.
252
+
253
+ Args:
254
+ video (mmcv.VideoReader): A VideoReader of the input video file.
255
+ frame_id (int): index of the current frame
256
+ indices (list(int)): index offsets of the frames to collect
257
+ online (bool): inference mode, if set to True, can not use future
258
+ frame information.
259
+
260
+ Returns:
261
+ list(ndarray): multi frames collected from the input video file.
262
+ """
263
+ num_frames = len(video)
264
+ frames = []
265
+ # put the current frame at first
266
+ frames.append(video[frame_id])
267
+ # use multi frames for inference
268
+ for idx in indices:
269
+ # skip current frame
270
+ if idx == 0:
271
+ continue
272
+ support_idx = frame_id + idx
273
+ # online mode, can not use future frame information
274
+ if online:
275
+ support_idx = np.clip(support_idx, 0, frame_id)
276
+ else:
277
+ support_idx = np.clip(support_idx, 0, num_frames - 1)
278
+ frames.append(video[support_idx])
279
+
280
+ return frames
mmpose/apis/inference_3d.py ADDED
@@ -0,0 +1,360 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import numpy as np
3
+ import torch
4
+ from mmengine.dataset import Compose, pseudo_collate
5
+ from mmengine.registry import init_default_scope
6
+ from mmengine.structures import InstanceData
7
+
8
+ from mmpose.structures import PoseDataSample
9
+
10
+
11
+ def convert_keypoint_definition(keypoints, pose_det_dataset,
12
+ pose_lift_dataset):
13
+ """Convert pose det dataset keypoints definition to pose lifter dataset
14
+ keypoints definition, so that they are compatible with the definitions
15
+ required for 3D pose lifting.
16
+
17
+ Args:
18
+ keypoints (ndarray[N, K, 2 or 3]): 2D keypoints to be transformed.
19
+ pose_det_dataset, (str): Name of the dataset for 2D pose detector.
20
+ pose_lift_dataset (str): Name of the dataset for pose lifter model.
21
+
22
+ Returns:
23
+ ndarray[K, 2 or 3]: the transformed 2D keypoints.
24
+ """
25
+ assert pose_lift_dataset in [
26
+ 'h36m', 'h3wb'], '`pose_lift_dataset` should be ' \
27
+ f'`h36m`, but got {pose_lift_dataset}.'
28
+
29
+ keypoints_new = np.zeros((keypoints.shape[0], 17, keypoints.shape[2]),
30
+ dtype=keypoints.dtype)
31
+ if pose_lift_dataset in ['h36m', 'h3wb']:
32
+ if pose_det_dataset in ['h36m', 'coco_wholebody']:
33
+ keypoints_new = keypoints
34
+ elif pose_det_dataset in ['coco', 'posetrack18']:
35
+ # pelvis (root) is in the middle of l_hip and r_hip
36
+ keypoints_new[:, 0] = (keypoints[:, 11] + keypoints[:, 12]) / 2
37
+ # thorax is in the middle of l_shoulder and r_shoulder
38
+ keypoints_new[:, 8] = (keypoints[:, 5] + keypoints[:, 6]) / 2
39
+ # spine is in the middle of thorax and pelvis
40
+ keypoints_new[:,
41
+ 7] = (keypoints_new[:, 0] + keypoints_new[:, 8]) / 2
42
+ # in COCO, head is in the middle of l_eye and r_eye
43
+ # in PoseTrack18, head is in the middle of head_bottom and head_top
44
+ keypoints_new[:, 10] = (keypoints[:, 1] + keypoints[:, 2]) / 2
45
+ # rearrange other keypoints
46
+ keypoints_new[:, [1, 2, 3, 4, 5, 6, 9, 11, 12, 13, 14, 15, 16]] = \
47
+ keypoints[:, [12, 14, 16, 11, 13, 15, 0, 5, 7, 9, 6, 8, 10]]
48
+ elif pose_det_dataset in ['aic']:
49
+ # pelvis (root) is in the middle of l_hip and r_hip
50
+ keypoints_new[:, 0] = (keypoints[:, 9] + keypoints[:, 6]) / 2
51
+ # thorax is in the middle of l_shoulder and r_shoulder
52
+ keypoints_new[:, 8] = (keypoints[:, 3] + keypoints[:, 0]) / 2
53
+ # spine is in the middle of thorax and pelvis
54
+ keypoints_new[:,
55
+ 7] = (keypoints_new[:, 0] + keypoints_new[:, 8]) / 2
56
+ # neck base (top end of neck) is 1/4 the way from
57
+ # neck (bottom end of neck) to head top
58
+ keypoints_new[:, 9] = (3 * keypoints[:, 13] + keypoints[:, 12]) / 4
59
+ # head (spherical centre of head) is 7/12 the way from
60
+ # neck (bottom end of neck) to head top
61
+ keypoints_new[:, 10] = (5 * keypoints[:, 13] +
62
+ 7 * keypoints[:, 12]) / 12
63
+
64
+ keypoints_new[:, [1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16]] = \
65
+ keypoints[:, [6, 7, 8, 9, 10, 11, 3, 4, 5, 0, 1, 2]]
66
+ elif pose_det_dataset in ['crowdpose']:
67
+ # pelvis (root) is in the middle of l_hip and r_hip
68
+ keypoints_new[:, 0] = (keypoints[:, 6] + keypoints[:, 7]) / 2
69
+ # thorax is in the middle of l_shoulder and r_shoulder
70
+ keypoints_new[:, 8] = (keypoints[:, 0] + keypoints[:, 1]) / 2
71
+ # spine is in the middle of thorax and pelvis
72
+ keypoints_new[:,
73
+ 7] = (keypoints_new[:, 0] + keypoints_new[:, 8]) / 2
74
+ # neck base (top end of neck) is 1/4 the way from
75
+ # neck (bottom end of neck) to head top
76
+ keypoints_new[:, 9] = (3 * keypoints[:, 13] + keypoints[:, 12]) / 4
77
+ # head (spherical centre of head) is 7/12 the way from
78
+ # neck (bottom end of neck) to head top
79
+ keypoints_new[:, 10] = (5 * keypoints[:, 13] +
80
+ 7 * keypoints[:, 12]) / 12
81
+
82
+ keypoints_new[:, [1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16]] = \
83
+ keypoints[:, [7, 9, 11, 6, 8, 10, 0, 2, 4, 1, 3, 5]]
84
+ else:
85
+ raise NotImplementedError(
86
+ f'unsupported conversion between {pose_lift_dataset} and '
87
+ f'{pose_det_dataset}')
88
+
89
+ return keypoints_new
90
+
91
+
92
+ def extract_pose_sequence(pose_results, frame_idx, causal, seq_len, step=1):
93
+ """Extract the target frame from 2D pose results, and pad the sequence to a
94
+ fixed length.
95
+
96
+ Args:
97
+ pose_results (List[List[:obj:`PoseDataSample`]]): Multi-frame pose
98
+ detection results stored in a list.
99
+ frame_idx (int): The index of the frame in the original video.
100
+ causal (bool): If True, the target frame is the last frame in
101
+ a sequence. Otherwise, the target frame is in the middle of
102
+ a sequence.
103
+ seq_len (int): The number of frames in the input sequence.
104
+ step (int): Step size to extract frames from the video.
105
+
106
+ Returns:
107
+ List[List[:obj:`PoseDataSample`]]: Multi-frame pose detection results
108
+ stored in a nested list with a length of seq_len.
109
+ """
110
+ if causal:
111
+ frames_left = seq_len - 1
112
+ frames_right = 0
113
+ else:
114
+ frames_left = (seq_len - 1) // 2
115
+ frames_right = frames_left
116
+ num_frames = len(pose_results)
117
+
118
+ # get the padded sequence
119
+ pad_left = max(0, frames_left - frame_idx // step)
120
+ pad_right = max(0, frames_right - (num_frames - 1 - frame_idx) // step)
121
+ start = max(frame_idx % step, frame_idx - frames_left * step)
122
+ end = min(num_frames - (num_frames - 1 - frame_idx) % step,
123
+ frame_idx + frames_right * step + 1)
124
+ pose_results_seq = [pose_results[0]] * pad_left + \
125
+ pose_results[start:end:step] + [pose_results[-1]] * pad_right
126
+ return pose_results_seq
127
+
128
+
129
+ def collate_pose_sequence(pose_results_2d,
130
+ with_track_id=True,
131
+ target_frame=-1):
132
+ """Reorganize multi-frame pose detection results into individual pose
133
+ sequences.
134
+
135
+ Note:
136
+ - The temporal length of the pose detection results: T
137
+ - The number of the person instances: N
138
+ - The number of the keypoints: K
139
+ - The channel number of each keypoint: C
140
+
141
+ Args:
142
+ pose_results_2d (List[List[:obj:`PoseDataSample`]]): Multi-frame pose
143
+ detection results stored in a nested list. Each element of the
144
+ outer list is the pose detection results of a single frame, and
145
+ each element of the inner list is the pose information of one
146
+ person, which contains:
147
+
148
+ - keypoints (ndarray[K, 2 or 3]): x, y, [score]
149
+ - track_id (int): unique id of each person, required when
150
+ ``with_track_id==True```
151
+
152
+ with_track_id (bool): If True, the element in pose_results is expected
153
+ to contain "track_id", which will be used to gather the pose
154
+ sequence of a person from multiple frames. Otherwise, the pose
155
+ results in each frame are expected to have a consistent number and
156
+ order of identities. Default is True.
157
+ target_frame (int): The index of the target frame. Default: -1.
158
+
159
+ Returns:
160
+ List[:obj:`PoseDataSample`]: Indivisual pose sequence in with length N.
161
+ """
162
+ T = len(pose_results_2d)
163
+ assert T > 0
164
+
165
+ target_frame = (T + target_frame) % T # convert negative index to positive
166
+
167
+ N = len(
168
+ pose_results_2d[target_frame]) # use identities in the target frame
169
+ if N == 0:
170
+ return []
171
+
172
+ B, K, C = pose_results_2d[target_frame][0].pred_instances.keypoints.shape
173
+
174
+ track_ids = None
175
+ if with_track_id:
176
+ track_ids = [res.track_id for res in pose_results_2d[target_frame]]
177
+
178
+ pose_sequences = []
179
+ for idx in range(N):
180
+ pose_seq = PoseDataSample()
181
+ pred_instances = InstanceData()
182
+
183
+ gt_instances = pose_results_2d[target_frame][idx].gt_instances.clone()
184
+ pred_instances = pose_results_2d[target_frame][
185
+ idx].pred_instances.clone()
186
+ pose_seq.pred_instances = pred_instances
187
+ pose_seq.gt_instances = gt_instances
188
+
189
+ if not with_track_id:
190
+ pose_seq.pred_instances.keypoints = np.stack([
191
+ frame[idx].pred_instances.keypoints
192
+ for frame in pose_results_2d
193
+ ],
194
+ axis=1)
195
+ else:
196
+ keypoints = np.zeros((B, T, K, C), dtype=np.float32)
197
+ keypoints[:, target_frame] = pose_results_2d[target_frame][
198
+ idx].pred_instances.keypoints
199
+ # find the left most frame containing track_ids[idx]
200
+ for frame_idx in range(target_frame - 1, -1, -1):
201
+ contains_idx = False
202
+ for res in pose_results_2d[frame_idx]:
203
+ if res.track_id == track_ids[idx]:
204
+ keypoints[:, frame_idx] = res.pred_instances.keypoints
205
+ contains_idx = True
206
+ break
207
+ if not contains_idx:
208
+ # replicate the left most frame
209
+ keypoints[:, :frame_idx + 1] = keypoints[:, frame_idx + 1]
210
+ break
211
+ # find the right most frame containing track_idx[idx]
212
+ for frame_idx in range(target_frame + 1, T):
213
+ contains_idx = False
214
+ for res in pose_results_2d[frame_idx]:
215
+ if res.track_id == track_ids[idx]:
216
+ keypoints[:, frame_idx] = res.pred_instances.keypoints
217
+ contains_idx = True
218
+ break
219
+ if not contains_idx:
220
+ # replicate the right most frame
221
+ keypoints[:, frame_idx + 1:] = keypoints[:, frame_idx]
222
+ break
223
+ pose_seq.pred_instances.set_field(keypoints, 'keypoints')
224
+ pose_sequences.append(pose_seq)
225
+
226
+ return pose_sequences
227
+
228
+
229
+ def inference_pose_lifter_model(model,
230
+ pose_results_2d,
231
+ with_track_id=True,
232
+ image_size=None,
233
+ norm_pose_2d=False):
234
+ """Inference 3D pose from 2D pose sequences using a pose lifter model.
235
+
236
+ Args:
237
+ model (nn.Module): The loaded pose lifter model
238
+ pose_results_2d (List[List[:obj:`PoseDataSample`]]): The 2D pose
239
+ sequences stored in a nested list.
240
+ with_track_id: If True, the element in pose_results_2d is expected to
241
+ contain "track_id", which will be used to gather the pose sequence
242
+ of a person from multiple frames. Otherwise, the pose results in
243
+ each frame are expected to have a consistent number and order of
244
+ identities. Default is True.
245
+ image_size (tuple|list): image width, image height. If None, image size
246
+ will not be contained in dict ``data``.
247
+ norm_pose_2d (bool): If True, scale the bbox (along with the 2D
248
+ pose) to the average bbox scale of the dataset, and move the bbox
249
+ (along with the 2D pose) to the average bbox center of the dataset.
250
+
251
+ Returns:
252
+ List[:obj:`PoseDataSample`]: 3D pose inference results. Specifically,
253
+ the predicted keypoints and scores are saved at
254
+ ``data_sample.pred_instances.keypoints_3d``.
255
+ """
256
+ init_default_scope(model.cfg.get('default_scope', 'mmpose'))
257
+ pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline)
258
+
259
+ causal = model.cfg.test_dataloader.dataset.get('causal', False)
260
+ target_idx = -1 if causal else len(pose_results_2d) // 2
261
+
262
+ dataset_info = model.dataset_meta
263
+ if dataset_info is not None:
264
+ if 'stats_info' in dataset_info:
265
+ bbox_center = dataset_info['stats_info']['bbox_center']
266
+ bbox_scale = dataset_info['stats_info']['bbox_scale']
267
+ else:
268
+ if norm_pose_2d:
269
+ # compute the average bbox center and scale from the
270
+ # datasamples in pose_results_2d
271
+ bbox_center = np.zeros((1, 2), dtype=np.float32)
272
+ bbox_scale = 0
273
+ num_bbox = 0
274
+ for pose_res in pose_results_2d:
275
+ for data_sample in pose_res:
276
+ for bbox in data_sample.pred_instances.bboxes:
277
+ bbox_center += np.array([[(bbox[0] + bbox[2]) / 2,
278
+ (bbox[1] + bbox[3]) / 2]
279
+ ])
280
+ bbox_scale += max(bbox[2] - bbox[0],
281
+ bbox[3] - bbox[1])
282
+ num_bbox += 1
283
+ bbox_center /= num_bbox
284
+ bbox_scale /= num_bbox
285
+ else:
286
+ bbox_center = None
287
+ bbox_scale = None
288
+
289
+ pose_results_2d_copy = []
290
+ for i, pose_res in enumerate(pose_results_2d):
291
+ pose_res_copy = []
292
+ for j, data_sample in enumerate(pose_res):
293
+ data_sample_copy = PoseDataSample()
294
+ data_sample_copy.gt_instances = data_sample.gt_instances.clone()
295
+ data_sample_copy.pred_instances = data_sample.pred_instances.clone(
296
+ )
297
+ data_sample_copy.track_id = data_sample.track_id
298
+ kpts = data_sample.pred_instances.keypoints
299
+ bboxes = data_sample.pred_instances.bboxes
300
+ keypoints = []
301
+ for k in range(len(kpts)):
302
+ kpt = kpts[k]
303
+ if norm_pose_2d:
304
+ bbox = bboxes[k]
305
+ center = np.array([[(bbox[0] + bbox[2]) / 2,
306
+ (bbox[1] + bbox[3]) / 2]])
307
+ scale = max(bbox[2] - bbox[0], bbox[3] - bbox[1])
308
+ keypoints.append((kpt[:, :2] - center) / scale *
309
+ bbox_scale + bbox_center)
310
+ else:
311
+ keypoints.append(kpt[:, :2])
312
+ data_sample_copy.pred_instances.set_field(
313
+ np.array(keypoints), 'keypoints')
314
+ pose_res_copy.append(data_sample_copy)
315
+ pose_results_2d_copy.append(pose_res_copy)
316
+
317
+ pose_sequences_2d = collate_pose_sequence(pose_results_2d_copy,
318
+ with_track_id, target_idx)
319
+
320
+ if not pose_sequences_2d:
321
+ return []
322
+
323
+ data_list = []
324
+ for i, pose_seq in enumerate(pose_sequences_2d):
325
+ data_info = dict()
326
+
327
+ keypoints_2d = pose_seq.pred_instances.keypoints
328
+ keypoints_2d = np.squeeze(
329
+ keypoints_2d, axis=0) if keypoints_2d.ndim == 4 else keypoints_2d
330
+
331
+ T, K, C = keypoints_2d.shape
332
+
333
+ data_info['keypoints'] = keypoints_2d
334
+ data_info['keypoints_visible'] = np.ones((
335
+ T,
336
+ K,
337
+ ), dtype=np.float32)
338
+ data_info['lifting_target'] = np.zeros((1, K, 3), dtype=np.float32)
339
+ data_info['factor'] = np.zeros((T, ), dtype=np.float32)
340
+ data_info['lifting_target_visible'] = np.ones((1, K, 1),
341
+ dtype=np.float32)
342
+
343
+ if image_size is not None:
344
+ assert len(image_size) == 2
345
+ data_info['camera_param'] = dict(w=image_size[0], h=image_size[1])
346
+
347
+ data_info.update(model.dataset_meta)
348
+ data_list.append(pipeline(data_info))
349
+
350
+ if data_list:
351
+ # collate data list into a batch, which is a dict with following keys:
352
+ # batch['inputs']: a list of input images
353
+ # batch['data_samples']: a list of :obj:`PoseDataSample`
354
+ batch = pseudo_collate(data_list)
355
+ with torch.no_grad():
356
+ results = model.test_step(batch)
357
+ else:
358
+ results = []
359
+
360
+ return results
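For orientation, here is a minimal usage sketch of this 2D-to-3D lifting routine. The helper names (`init_model`, `inference_topdown`) and the enclosing function name (`inference_pose_lifter_model`) are assumed from the `mmpose.apis` conventions and are not shown in this hunk; all paths are placeholders.

# Hypothetical driver: lift a short window of per-frame 2D predictions to 3D.
from mmpose.apis import inference_pose_lifter_model, inference_topdown, init_model

pose2d_model = init_model('path/to/pose2d_config.py', 'path/to/pose2d.pth')
lifter = init_model('path/to/pose_lifter_config.py', 'path/to/lifter.pth')

# One list of PoseDataSample per frame; for a non-causal dataset the middle
# frame of the window is the lifting target (target_idx = len(window) // 2).
pose_results_2d = [
    inference_topdown(pose2d_model, f'frames/{i:04d}.jpg') for i in range(27)
]

results_3d = inference_pose_lifter_model(
    lifter,
    pose_results_2d,
    image_size=(1920, 1080),  # fills camera_param with w and h
    norm_pose_2d=True)        # re-normalize 2D poses with dataset bbox statistics
# results_3d[0].pred_instances.keypoints_3d holds the lifted keypoints.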
mmpose/apis/inference_tracking.py ADDED
@@ -0,0 +1,103 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import warnings
3
+
4
+ import numpy as np
5
+
6
+ from mmpose.evaluation.functional.nms import oks_iou
7
+
8
+
9
+ def _compute_iou(bboxA, bboxB):
10
+ """Compute the Intersection over Union (IoU) between two boxes .
11
+
12
+ Args:
13
+ bboxA (list): The first bbox info (left, top, right, bottom, score).
14
+ bboxB (list): The second bbox info (left, top, right, bottom, score).
15
+
16
+ Returns:
17
+ float: The IoU value.
18
+ """
19
+
20
+ x1 = max(bboxA[0], bboxB[0])
21
+ y1 = max(bboxA[1], bboxB[1])
22
+ x2 = min(bboxA[2], bboxB[2])
23
+ y2 = min(bboxA[3], bboxB[3])
24
+
25
+ inter_area = max(0, x2 - x1) * max(0, y2 - y1)
26
+
27
+ bboxA_area = (bboxA[2] - bboxA[0]) * (bboxA[3] - bboxA[1])
28
+ bboxB_area = (bboxB[2] - bboxB[0]) * (bboxB[3] - bboxB[1])
29
+ union_area = float(bboxA_area + bboxB_area - inter_area)
30
+ if union_area == 0:
31
+ union_area = 1e-5
32
+ warnings.warn('union_area=0 is unexpected')
33
+
34
+ iou = inter_area / union_area
35
+
36
+ return iou
37
+
38
+
39
+ def _track_by_iou(res, results_last, thr):
40
+ """Get track id using IoU tracking greedily."""
41
+
42
+ bbox = list(np.squeeze(res.pred_instances.bboxes, axis=0))
43
+
44
+ max_iou_score = -1
45
+ max_index = -1
46
+ match_result = {}
47
+ for index, res_last in enumerate(results_last):
48
+ bbox_last = list(np.squeeze(res_last.pred_instances.bboxes, axis=0))
49
+
50
+ iou_score = _compute_iou(bbox, bbox_last)
51
+ if iou_score > max_iou_score:
52
+ max_iou_score = iou_score
53
+ max_index = index
54
+
55
+ if max_iou_score > thr:
56
+ track_id = results_last[max_index].track_id
57
+ match_result = results_last[max_index]
58
+ del results_last[max_index]
59
+ else:
60
+ track_id = -1
61
+
62
+ return track_id, results_last, match_result
63
+
64
+
65
+ def _track_by_oks(res, results_last, thr, sigmas=None):
66
+ """Get track id using OKS tracking greedily."""
67
+ keypoint = np.concatenate((res.pred_instances.keypoints,
68
+ res.pred_instances.keypoint_scores[:, :, None]),
69
+ axis=2)
70
+ keypoint = np.squeeze(keypoint, axis=0).reshape((-1))
71
+ area = np.squeeze(res.pred_instances.areas, axis=0)
72
+ max_index = -1
73
+ match_result = {}
74
+
75
+ if len(results_last) == 0:
76
+ return -1, results_last, match_result
77
+
78
+ keypoints_last = np.array([
79
+ np.squeeze(
80
+ np.concatenate(
81
+ (res_last.pred_instances.keypoints,
82
+ res_last.pred_instances.keypoint_scores[:, :, None]),
83
+ axis=2),
84
+ axis=0).reshape((-1)) for res_last in results_last
85
+ ])
86
+ area_last = np.array([
87
+ np.squeeze(res_last.pred_instances.areas, axis=0)
88
+ for res_last in results_last
89
+ ])
90
+
91
+ oks_score = oks_iou(
92
+ keypoint, keypoints_last, area, area_last, sigmas=sigmas)
93
+
94
+ max_index = np.argmax(oks_score)
95
+
96
+ if oks_score[max_index] > thr:
97
+ track_id = results_last[max_index].track_id
98
+ match_result = results_last[max_index]
99
+ del results_last[max_index]
100
+ else:
101
+ track_id = -1
102
+
103
+ return track_id, results_last, match_result
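To show how these greedy matchers are meant to be used, here is a sketch of a per-frame tracking loop built on the helpers above (the outer loop and variable names are illustrative and not part of this file):

# Hypothetical tracking loop: match each frame against the previous one.
next_id = 0
results_last = []
for frame_results in all_frame_results:  # assumed: list of per-frame result lists
    remaining = list(results_last)
    for res in frame_results:
        track_id, remaining, _ = _track_by_iou(res, remaining, thr=0.3)
        if track_id == -1:          # no overlap above the threshold: new identity
            track_id = next_id
            next_id += 1
        res.track_id = track_id
    results_last = frame_results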
mmpose/apis/inferencers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from .hand3d_inferencer import Hand3DInferencer
3
+ from .mmpose_inferencer import MMPoseInferencer
4
+ from .pose2d_inferencer import Pose2DInferencer
5
+ from .pose3d_inferencer import Pose3DInferencer
6
+ from .utils import get_model_aliases
7
+
8
+ __all__ = [
9
+ 'Pose2DInferencer', 'MMPoseInferencer', 'get_model_aliases',
10
+ 'Pose3DInferencer', 'Hand3DInferencer'
11
+ ]
mmpose/apis/inferencers/base_mmpose_inferencer.py ADDED
@@ -0,0 +1,691 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import inspect
3
+ import logging
4
+ import mimetypes
5
+ import os
6
+ from collections import defaultdict
7
+ from typing import (Callable, Dict, Generator, Iterable, List, Optional,
8
+ Sequence, Tuple, Union)
9
+
10
+ import cv2
11
+ import mmcv
12
+ import mmengine
13
+ import numpy as np
14
+ import torch.nn as nn
15
+ from mmengine.config import Config, ConfigDict
16
+ from mmengine.dataset import Compose
17
+ from mmengine.fileio import (get_file_backend, isdir, join_path,
18
+ list_dir_or_file)
19
+ from mmengine.infer.infer import BaseInferencer, ModelType
20
+ from mmengine.logging import print_log
21
+ from mmengine.registry import init_default_scope
22
+ from mmengine.runner.checkpoint import _load_checkpoint_to_model
23
+ from mmengine.structures import InstanceData
24
+ from mmengine.utils import mkdir_or_exist
25
+ from rich.progress import track
26
+
27
+ from mmpose.apis.inference import dataset_meta_from_config
28
+ from mmpose.registry import DATASETS
29
+ from mmpose.structures import PoseDataSample, split_instances
30
+ from .utils import default_det_models
31
+
32
+ try:
33
+ from mmdet.apis.det_inferencer import DetInferencer
34
+ has_mmdet = True
35
+ except (ImportError, ModuleNotFoundError):
36
+ has_mmdet = False
37
+
38
+ InstanceList = List[InstanceData]
39
+ InputType = Union[str, np.ndarray]
40
+ InputsType = Union[InputType, Sequence[InputType]]
41
+ PredType = Union[InstanceData, InstanceList]
42
+ ImgType = Union[np.ndarray, Sequence[np.ndarray]]
43
+ ConfigType = Union[Config, ConfigDict]
44
+ ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]
45
+
46
+
47
+ class BaseMMPoseInferencer(BaseInferencer):
48
+ """The base class for MMPose inferencers."""
49
+
50
+ preprocess_kwargs: set = {'bbox_thr', 'nms_thr', 'bboxes'}
51
+ forward_kwargs: set = set()
52
+ visualize_kwargs: set = {
53
+ 'return_vis', 'show', 'wait_time', 'draw_bbox', 'radius', 'thickness',
54
+ 'kpt_thr', 'vis_out_dir', 'black_background'
55
+ }
56
+ postprocess_kwargs: set = {'pred_out_dir', 'return_datasample'}
57
+
58
+ def __init__(self,
59
+ model: Union[ModelType, str, None] = None,
60
+ weights: Optional[str] = None,
61
+ device: Optional[str] = None,
62
+ scope: Optional[str] = None,
63
+ show_progress: bool = False) -> None:
64
+ super().__init__(
65
+ model, weights, device, scope, show_progress=show_progress)
66
+
67
+ def _init_detector(
68
+ self,
69
+ det_model: Optional[Union[ModelType, str]] = None,
70
+ det_weights: Optional[str] = None,
71
+ det_cat_ids: Optional[Union[int, Tuple]] = None,
72
+ device: Optional[str] = None,
73
+ ):
74
+ object_type = DATASETS.get(self.cfg.dataset_type).__module__.split(
75
+ 'datasets.')[-1].split('.')[0].lower()
76
+
77
+ if det_model in ('whole_image', 'whole-image') or \
78
+ (det_model is None and
79
+ object_type not in default_det_models):
80
+ self.detector = None
81
+
82
+ else:
83
+ det_scope = 'mmdet'
84
+ if det_model is None:
85
+ det_info = default_det_models[object_type]
86
+ det_model, det_weights, det_cat_ids = det_info[
87
+ 'model'], det_info['weights'], det_info['cat_ids']
88
+ elif os.path.exists(det_model):
89
+ det_cfg = Config.fromfile(det_model)
90
+ det_scope = det_cfg.default_scope
91
+
92
+ if has_mmdet:
93
+ det_kwargs = dict(
94
+ model=det_model,
95
+ weights=det_weights,
96
+ device=device,
97
+ scope=det_scope,
98
+ )
99
+ # for compatibility with low version of mmdet
100
+ if 'show_progress' in inspect.signature(
101
+ DetInferencer).parameters:
102
+ det_kwargs['show_progress'] = False
103
+
104
+ self.detector = DetInferencer(**det_kwargs)
105
+ else:
106
+ raise RuntimeError(
107
+ 'MMDetection (v3.0.0 or above) is required to build '
108
+ 'inferencers for top-down pose estimation models.')
109
+
110
+ if isinstance(det_cat_ids, (tuple, list)):
111
+ self.det_cat_ids = det_cat_ids
112
+ else:
113
+ self.det_cat_ids = (det_cat_ids, )
114
+
115
+ def _load_weights_to_model(self, model: nn.Module,
116
+ checkpoint: Optional[dict],
117
+ cfg: Optional[ConfigType]) -> None:
118
+ """Loading model weights and meta information from cfg and checkpoint.
119
+
120
+ Subclasses could override this method to load extra meta information
121
+ from ``checkpoint`` and ``cfg`` to model.
122
+
123
+ Args:
124
+ model (nn.Module): Model to load weights and meta information.
125
+ checkpoint (dict, optional): The loaded checkpoint.
126
+ cfg (Config or ConfigDict, optional): The loaded config.
127
+ """
128
+ if checkpoint is not None:
129
+ _load_checkpoint_to_model(model, checkpoint)
130
+ checkpoint_meta = checkpoint.get('meta', {})
131
+ # save the dataset_meta in the model for convenience
132
+ if 'dataset_meta' in checkpoint_meta:
133
+ # mmpose 1.x
134
+ model.dataset_meta = checkpoint_meta['dataset_meta']
135
+ else:
136
+ print_log(
137
+ 'dataset_meta are not saved in the checkpoint\'s '
138
+ 'meta data, load via config.',
139
+ logger='current',
140
+ level=logging.WARNING)
141
+ model.dataset_meta = dataset_meta_from_config(
142
+ cfg, dataset_mode='train')
143
+ else:
144
+ print_log(
145
+ 'Checkpoint is not loaded, and the inference '
146
+ 'result is calculated by the randomly initialized '
147
+ 'model!',
148
+ logger='current',
149
+ level=logging.WARNING)
150
+ model.dataset_meta = dataset_meta_from_config(
151
+ cfg, dataset_mode='train')
152
+
153
+ def _inputs_to_list(self, inputs: InputsType) -> Iterable:
154
+ """Preprocess the inputs to a list.
155
+
156
+ Preprocess inputs to a list according to its type:
157
+
158
+ - list or tuple: return inputs
159
+ - str:
160
+ - Directory path: return all files in the directory
161
+ - other cases: return a list containing the string. The string
162
+ could be a path to file, a url or other types of string
163
+ according to the task.
164
+
165
+ Args:
166
+ inputs (InputsType): Inputs for the inferencer.
167
+
168
+ Returns:
169
+ list: List of input for the :meth:`preprocess`.
170
+ """
171
+ self._video_input = False
172
+
173
+ if isinstance(inputs, str):
174
+ backend = get_file_backend(inputs)
175
+ if hasattr(backend, 'isdir') and isdir(inputs):
176
+ # Backends like HttpsBackend do not implement `isdir`, so only
177
+ # those backends that implement `isdir` could accept the
178
+ # inputs as a directory
179
+ filepath_list = [
180
+ join_path(inputs, fname)
181
+ for fname in list_dir_or_file(inputs, list_dir=False)
182
+ ]
183
+ inputs = []
184
+ for filepath in filepath_list:
185
+ input_type = mimetypes.guess_type(filepath)[0].split(
186
+ '/')[0]
187
+ if input_type == 'image':
188
+ inputs.append(filepath)
189
+ inputs.sort()
190
+ else:
191
+ # if inputs is a path to a video file, it will be converted
192
+ # to a list containing separated frame filenames
193
+ input_type = mimetypes.guess_type(inputs)[0].split('/')[0]
194
+ if input_type == 'video':
195
+ self._video_input = True
196
+ video = mmcv.VideoReader(inputs)
197
+ self.video_info = dict(
198
+ fps=video.fps,
199
+ name=os.path.basename(inputs),
200
+ writer=None,
201
+ width=video.width,
202
+ height=video.height,
203
+ predictions=[])
204
+ inputs = video
205
+ elif input_type == 'image':
206
+ inputs = [inputs]
207
+ else:
208
+ raise ValueError(f'Expected input to be an image, video, '
209
+ f'or folder, but received {inputs} of '
210
+ f'type {input_type}.')
211
+
212
+ elif isinstance(inputs, np.ndarray):
213
+ inputs = [inputs]
214
+
215
+ return inputs
216
+
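As a quick illustration of the accepted input forms listed above (a sketch; `inferencer` stands for any subclass of this base class):

# Hypothetical calls: each of these is normalized by _inputs_to_list.
import numpy as np

inferencer('path/to/image_dir/')   # directory -> sorted list of image paths
inferencer('path/to/image.jpg')    # single image path -> [path]
inferencer('path/to/video.mp4')    # video file -> frames via mmcv.VideoReader
inferencer(np.zeros((480, 640, 3), dtype=np.uint8))  # ndarray -> [ndarray]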
217
+ def _get_webcam_inputs(self, inputs: str) -> Generator:
218
+ """Sets up and returns a generator function that reads frames from a
219
+ webcam input. The generator function returns a new frame each time it
220
+ is iterated over.
221
+
222
+ Args:
223
+ inputs (str): A string describing the webcam input, in the format
224
+ "webcam:id".
225
+
226
+ Returns:
227
+ A generator function that yields frames from the webcam input.
228
+
229
+ Raises:
230
+ ValueError: If the inputs string is not in the expected format.
231
+ """
232
+
233
+ # Ensure the inputs string is in the expected format.
234
+ inputs = inputs.lower()
235
+ assert inputs.startswith('webcam'), f'Expected input to start with ' \
236
+ f'"webcam", but got "{inputs}"'
237
+
238
+ # Parse the camera ID from the inputs string.
239
+ inputs_ = inputs.split(':')
240
+ if len(inputs_) == 1:
241
+ camera_id = 0
242
+ elif len(inputs_) == 2 and str.isdigit(inputs_[1]):
243
+ camera_id = int(inputs_[1])
244
+ else:
245
+ raise ValueError(
246
+ f'Expected webcam input to have format "webcam:id", '
247
+ f'but got "{inputs}"')
248
+
249
+ # Attempt to open the video capture object.
250
+ vcap = cv2.VideoCapture(camera_id)
251
+ if not vcap.isOpened():
252
+ print_log(
253
+ f'Cannot open camera (ID={camera_id})',
254
+ logger='current',
255
+ level=logging.WARNING)
256
+ return []
257
+
258
+ # Set video input flag and metadata.
259
+ self._video_input = True
260
+ (major_ver, minor_ver, subminor_ver) = (cv2.__version__).split('.')
261
+ if int(major_ver) < 3:
262
+ fps = vcap.get(cv2.cv.CV_CAP_PROP_FPS)
263
+ width = vcap.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH)
264
+ height = vcap.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)
265
+ else:
266
+ fps = vcap.get(cv2.CAP_PROP_FPS)
267
+ width = vcap.get(cv2.CAP_PROP_FRAME_WIDTH)
268
+ height = vcap.get(cv2.CAP_PROP_FRAME_HEIGHT)
269
+ self.video_info = dict(
270
+ fps=fps,
271
+ name='webcam.mp4',
272
+ writer=None,
273
+ width=width,
274
+ height=height,
275
+ predictions=[])
276
+
277
+ def _webcam_reader() -> Generator:
278
+ while True:
279
+ if cv2.waitKey(5) & 0xFF == 27:
280
+ vcap.release()
281
+ break
282
+
283
+ ret_val, frame = vcap.read()
284
+ if not ret_val:
285
+ break
286
+
287
+ yield frame
288
+
289
+ return _webcam_reader()
290
+
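For completeness, the webcam string documented above is consumed like this (a sketch; pressing ESC releases the capture inside `_webcam_reader`):

# Hypothetical call: stream from the default camera and display results live.
# 'webcam' and 'webcam:1' are also accepted; the camera id defaults to 0.
for result in inferencer('webcam:0', show=True):
    pass  # one result dict per frame until the stream ends or ESC is pressed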
291
+ def _init_pipeline(self, cfg: ConfigType) -> Callable:
292
+ """Initialize the test pipeline.
293
+
294
+ Args:
295
+ cfg (ConfigType): model config path or dict
296
+
297
+ Returns:
298
+ A pipeline to handle various input data, such as ``str``,
299
+ ``np.ndarray``. The returned pipeline will be used to process
300
+ a single data.
301
+ """
302
+ scope = cfg.get('default_scope', 'mmpose')
303
+ if scope is not None:
304
+ init_default_scope(scope)
305
+ return Compose(cfg.test_dataloader.dataset.pipeline)
306
+
307
+ def update_model_visualizer_settings(self, **kwargs):
308
+ """Update the settings of models and visualizer according to inference
309
+ arguments."""
310
+
311
+ pass
312
+
313
+ def preprocess(self,
314
+ inputs: InputsType,
315
+ batch_size: int = 1,
316
+ bboxes: Optional[List] = None,
317
+ bbox_thr: float = 0.3,
318
+ nms_thr: float = 0.3,
319
+ **kwargs):
320
+ """Process the inputs into a model-feedable format.
321
+
322
+ Args:
323
+ inputs (InputsType): Inputs given by user.
324
+ batch_size (int): batch size. Defaults to 1.
325
+ bbox_thr (float): threshold for bounding box detection.
326
+ Defaults to 0.3.
327
+ nms_thr (float): IoU threshold for bounding box NMS.
328
+ Defaults to 0.3.
329
+
330
+ Yields:
331
+ Any: Data processed by the ``pipeline`` and ``collate_fn``.
332
+ List[str or np.ndarray]: List of original inputs in the batch
333
+ """
334
+
335
+ # One-stage pose estimators perform prediction filtering within the
336
+ # head's `predict` method. Here, we set the arguments for filtering
337
+ if self.cfg.model.type == 'BottomupPoseEstimator':
338
+ # 1. init with default arguments
339
+ test_cfg = self.model.head.test_cfg.copy()
340
+ # 2. update the score_thr and nms_thr in the test_cfg of the head
341
+ if 'score_thr' in test_cfg:
342
+ test_cfg['score_thr'] = bbox_thr
343
+ if 'nms_thr' in test_cfg:
344
+ test_cfg['nms_thr'] = nms_thr
345
+ self.model.test_cfg = test_cfg
346
+
347
+ for i, input in enumerate(inputs):
348
+ bbox = bboxes[i] if bboxes else []
349
+ data_infos = self.preprocess_single(
350
+ input,
351
+ index=i,
352
+ bboxes=bbox,
353
+ bbox_thr=bbox_thr,
354
+ nms_thr=nms_thr,
355
+ **kwargs)
356
+ # only supports inference with batch size 1
357
+ yield self.collate_fn(data_infos), [input]
358
+
359
+ def __call__(
360
+ self,
361
+ inputs: InputsType,
362
+ return_datasamples: bool = False,
363
+ batch_size: int = 1,
364
+ out_dir: Optional[str] = None,
365
+ **kwargs,
366
+ ) -> dict:
367
+ """Call the inferencer.
368
+
369
+ Args:
370
+ inputs (InputsType): Inputs for the inferencer.
371
+ return_datasamples (bool): Whether to return results as
372
+ :obj:`BaseDataElement`. Defaults to False.
373
+ batch_size (int): Batch size. Defaults to 1.
374
+ out_dir (str, optional): directory to save visualization
375
+ results and predictions. Will be overridden if vis_out_dir or
376
+ pred_out_dir are given. Defaults to None
377
+ **kwargs: Key words arguments passed to :meth:`preprocess`,
378
+ :meth:`forward`, :meth:`visualize` and :meth:`postprocess`.
379
+ Each key in kwargs should be in the corresponding set of
380
+ ``preprocess_kwargs``, ``forward_kwargs``,
381
+ ``visualize_kwargs`` and ``postprocess_kwargs``.
382
+
383
+ Returns:
384
+ dict: Inference and visualization results.
385
+ """
386
+ if out_dir is not None:
387
+ if 'vis_out_dir' not in kwargs:
388
+ kwargs['vis_out_dir'] = f'{out_dir}/visualizations'
389
+ if 'pred_out_dir' not in kwargs:
390
+ kwargs['pred_out_dir'] = f'{out_dir}/predictions'
391
+
392
+ (
393
+ preprocess_kwargs,
394
+ forward_kwargs,
395
+ visualize_kwargs,
396
+ postprocess_kwargs,
397
+ ) = self._dispatch_kwargs(**kwargs)
398
+
399
+ self.update_model_visualizer_settings(**kwargs)
400
+
401
+ # preprocessing
402
+ if isinstance(inputs, str) and inputs.startswith('webcam'):
403
+ inputs = self._get_webcam_inputs(inputs)
404
+ batch_size = 1
405
+ if not visualize_kwargs.get('show', False):
406
+ print_log(
407
+ 'The display mode is turned off, but webcam input requires '
408
+ 'it; it will be enabled automatically.',
409
+ logger='current',
410
+ level=logging.WARNING)
411
+ visualize_kwargs['show'] = True
412
+ else:
413
+ inputs = self._inputs_to_list(inputs)
414
+
415
+ # check the compatibility between inputs/outputs
416
+ if not self._video_input and len(inputs) > 0:
417
+ vis_out_dir = visualize_kwargs.get('vis_out_dir', None)
418
+ if vis_out_dir is not None:
419
+ _, file_extension = os.path.splitext(vis_out_dir)
420
+ assert not file_extension, f'the argument `vis_out_dir` ' \
421
+ f'should be a folder while the input contains multiple ' \
422
+ f'images, but got {vis_out_dir}'
423
+
424
+ if 'bbox_thr' in self.forward_kwargs:
425
+ forward_kwargs['bbox_thr'] = preprocess_kwargs.get('bbox_thr', -1)
426
+ inputs = self.preprocess(
427
+ inputs, batch_size=batch_size, **preprocess_kwargs)
428
+
429
+ preds = []
430
+
431
+ for proc_inputs, ori_inputs in (track(inputs, description='Inference')
432
+ if self.show_progress else inputs):
433
+ preds = self.forward(proc_inputs, **forward_kwargs)
434
+
435
+ visualization = self.visualize(ori_inputs, preds,
436
+ **visualize_kwargs)
437
+ results = self.postprocess(
438
+ preds,
439
+ visualization,
440
+ return_datasamples=return_datasamples,
441
+ **postprocess_kwargs)
442
+ yield results
443
+
444
+ if self._video_input:
445
+ self._finalize_video_processing(
446
+ postprocess_kwargs.get('pred_out_dir', ''))
447
+
448
+ # In 3D Inferencers, some intermediate results (e.g. 2d keypoints)
449
+ # will be temporarily stored in `self._buffer`. It's essential to
450
+ # clear this information to prevent any interference with subsequent
451
+ # inferences.
452
+ if hasattr(self, '_buffer'):
453
+ self._buffer.clear()
454
+
455
+ def visualize(self,
456
+ inputs: list,
457
+ preds: List[PoseDataSample],
458
+ return_vis: bool = False,
459
+ show: bool = False,
460
+ draw_bbox: bool = False,
461
+ wait_time: float = 0,
462
+ radius: int = 3,
463
+ thickness: int = 1,
464
+ kpt_thr: float = 0.3,
465
+ vis_out_dir: str = '',
466
+ window_name: str = '',
467
+ black_background: bool = False,
468
+ **kwargs) -> List[np.ndarray]:
469
+ """Visualize predictions.
470
+
471
+ Args:
472
+ inputs (list): Inputs preprocessed by :meth:`_inputs_to_list`.
473
+ preds (Any): Predictions of the model.
474
+ return_vis (bool): Whether to return images with predicted results.
475
+ show (bool): Whether to display the image in a popup window.
476
+ Defaults to False.
477
+ wait_time (float): The interval of show (ms). Defaults to 0
478
+ draw_bbox (bool): Whether to draw the bounding boxes.
479
+ Defaults to False
480
+ radius (int): Keypoint radius for visualization. Defaults to 3
481
+ thickness (int): Link thickness for visualization. Defaults to 1
482
+ kpt_thr (float): The threshold to visualize the keypoints.
483
+ Defaults to 0.3
484
+ vis_out_dir (str, optional): Directory to save visualization
485
+ results w/o predictions. If left as empty, no file will
486
+ be saved. Defaults to ''.
487
+ window_name (str, optional): Title of display window.
488
+ black_background (bool, optional): Whether to plot keypoints on a
489
+ black image instead of the input image. Defaults to False.
490
+
491
+ Returns:
492
+ List[np.ndarray]: Visualization results.
493
+ """
494
+ if (not return_vis) and (not show) and (not vis_out_dir):
495
+ return
496
+
497
+ if getattr(self, 'visualizer', None) is None:
498
+ raise ValueError('Visualization needs the "visualizer" term '
499
+ 'defined in the config, but got None.')
500
+
501
+ self.visualizer.radius = radius
502
+ self.visualizer.line_width = thickness
503
+
504
+ results = []
505
+
506
+ for single_input, pred in zip(inputs, preds):
507
+ if isinstance(single_input, str):
508
+ img = mmcv.imread(single_input, channel_order='rgb')
509
+ elif isinstance(single_input, np.ndarray):
510
+ img = mmcv.bgr2rgb(single_input)
511
+ else:
512
+ raise ValueError('Unsupported input type: '
513
+ f'{type(single_input)}')
514
+ if black_background:
515
+ img = img * 0
516
+
517
+ img_name = os.path.basename(pred.metainfo['img_path'])
518
+ window_name = window_name if window_name else img_name
519
+
520
+ # since visualization and inference utilize the same process,
521
+ # the wait time is reduced when a video input is utilized,
522
+ # thereby eliminating the issue of inference getting stuck.
523
+ wait_time = 1e-5 if self._video_input else wait_time
524
+
525
+ visualization = self.visualizer.add_datasample(
526
+ window_name,
527
+ img,
528
+ pred,
529
+ draw_gt=False,
530
+ draw_bbox=draw_bbox,
531
+ show=show,
532
+ wait_time=wait_time,
533
+ kpt_thr=kpt_thr,
534
+ **kwargs)
535
+ results.append(visualization)
536
+
537
+ if vis_out_dir:
538
+ self.save_visualization(
539
+ visualization,
540
+ vis_out_dir,
541
+ img_name=img_name,
542
+ )
543
+
544
+ if return_vis:
545
+ return results
546
+ else:
547
+ return []
548
+
549
+ def save_visualization(self, visualization, vis_out_dir, img_name=None):
550
+ out_img = mmcv.rgb2bgr(visualization)
551
+ _, file_extension = os.path.splitext(vis_out_dir)
552
+ if file_extension:
553
+ dir_name = os.path.dirname(vis_out_dir)
554
+ file_name = os.path.basename(vis_out_dir)
555
+ else:
556
+ dir_name = vis_out_dir
557
+ file_name = None
558
+ mkdir_or_exist(dir_name)
559
+
560
+ if self._video_input:
561
+
562
+ if self.video_info['writer'] is None:
563
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
564
+ if file_name is None:
565
+ file_name = os.path.basename(self.video_info['name'])
566
+ out_file = join_path(dir_name, file_name)
567
+ self.video_info['output_file'] = out_file
568
+ self.video_info['writer'] = cv2.VideoWriter(
569
+ out_file, fourcc, self.video_info['fps'],
570
+ (visualization.shape[1], visualization.shape[0]))
571
+ self.video_info['writer'].write(out_img)
572
+
573
+ else:
574
+ if file_name is None:
575
+ file_name = img_name if img_name else 'visualization.jpg'
576
+
577
+ out_file = join_path(dir_name, file_name)
578
+ mmcv.imwrite(out_img, out_file)
579
+ print_log(
580
+ f'the output image has been saved at {out_file}',
581
+ logger='current',
582
+ level=logging.INFO)
583
+
584
+ def postprocess(
585
+ self,
586
+ preds: List[PoseDataSample],
587
+ visualization: List[np.ndarray],
588
+ return_datasample=None,
589
+ return_datasamples=False,
590
+ pred_out_dir: str = '',
591
+ ) -> dict:
592
+ """Process the predictions and visualization results from ``forward``
593
+ and ``visualize``.
594
+
595
+ This method should be responsible for the following tasks:
596
+
597
+ 1. Convert datasamples into a json-serializable dict if needed.
598
+ 2. Pack the predictions and visualization results and return them.
599
+ 3. Dump or log the predictions.
600
+
601
+ Args:
602
+ preds (List[Dict]): Predictions of the model.
603
+ visualization (np.ndarray): Visualized predictions.
604
+ return_datasamples (bool): Whether to return results as
605
+ datasamples. Defaults to False.
606
+ pred_out_dir (str): Directory to save the inference results w/o
607
+ visualization. If left as empty, no file will be saved.
608
+ Defaults to ''.
609
+
610
+ Returns:
611
+ dict: Inference and visualization results with key ``predictions``
612
+ and ``visualization``
613
+
614
+ - ``visualization (Any)``: Returned by :meth:`visualize`
615
+ - ``predictions`` (dict or DataSample): Returned by
616
+ :meth:`forward` and processed in :meth:`postprocess`.
617
+ If ``return_datasamples=False``, it usually should be a
618
+ json-serializable dict containing only basic data elements such
619
+ as strings and numbers.
620
+ """
621
+ if return_datasample is not None:
622
+ print_log(
623
+ 'The `return_datasample` argument is deprecated '
624
+ 'and will be removed in future versions. Please '
625
+ 'use `return_datasamples`.',
626
+ logger='current',
627
+ level=logging.WARNING)
628
+ return_datasamples = return_datasample
629
+
630
+ result_dict = defaultdict(list)
631
+
632
+ result_dict['visualization'] = visualization
633
+ for pred in preds:
634
+ if not return_datasamples:
635
+ # convert datasamples to list of instance predictions
636
+ pred = split_instances(pred.pred_instances)
637
+ result_dict['predictions'].append(pred)
638
+
639
+ if pred_out_dir != '':
640
+ for pred, data_sample in zip(result_dict['predictions'], preds):
641
+ if self._video_input:
642
+ # For video or webcam input, predictions for each frame
643
+ # are gathered in the 'predictions' key of 'video_info'
644
+ # dictionary. All frame predictions are then stored into
645
+ # a single file after processing all frames.
646
+ self.video_info['predictions'].append(pred)
647
+ else:
648
+ # For non-video inputs, predictions are stored in separate
649
+ # JSON files. The filename is determined by the basename
650
+ # of the input image path with a '.json' extension. The
651
+ # predictions are then dumped into this file.
652
+ fname = os.path.splitext(
653
+ os.path.basename(
654
+ data_sample.metainfo['img_path']))[0] + '.json'
655
+ mmengine.dump(
656
+ pred, join_path(pred_out_dir, fname), indent=' ')
657
+
658
+ return result_dict
659
+
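To make the returned structure concrete, a sketch of what a caller receives for a single image with `return_datasamples=False` (field names follow `split_instances`; the exact fields, nesting, and values depend on the model):

# Hypothetical result_dict for one input image:
example_result = {
    'visualization': [None],   # rendered np.ndarray when visualization is enabled
    'predictions': [[          # one inner list per input image
        {
            'keypoints': [[320.0, 240.0]],  # (K, 2) coordinates per instance
            'keypoint_scores': [0.91],      # per-keypoint confidence
            'bbox': [[100.0, 80.0, 400.0, 480.0]],
            'bbox_score': 0.98,
        },
    ]],
}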
660
+ def _finalize_video_processing(
661
+ self,
662
+ pred_out_dir: str = '',
663
+ ):
664
+ """Finalize video processing by releasing the video writer and saving
665
+ predictions to a file.
666
+
667
+ This method should be called after completing the video processing. It
668
+ releases the video writer, if it exists, and saves the predictions to a
669
+ JSON file if a prediction output directory is provided.
670
+ """
671
+
672
+ # Release the video writer if it exists
673
+ if self.video_info['writer'] is not None:
674
+ out_file = self.video_info['output_file']
675
+ print_log(
676
+ f'the output video has been saved at {out_file}',
677
+ logger='current',
678
+ level=logging.INFO)
679
+ self.video_info['writer'].release()
680
+
681
+ # Save predictions
682
+ if pred_out_dir:
683
+ fname = os.path.splitext(
684
+ os.path.basename(self.video_info['name']))[0] + '.json'
685
+ predictions = [
686
+ dict(frame_id=i, instances=pred)
687
+ for i, pred in enumerate(self.video_info['predictions'])
688
+ ]
689
+
690
+ mmengine.dump(
691
+ predictions, join_path(pred_out_dir, fname), indent=' ')
mmpose/apis/inferencers/hand3d_inferencer.py ADDED
@@ -0,0 +1,344 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import logging
3
+ import os
4
+ from collections import defaultdict
5
+ from typing import Dict, List, Optional, Sequence, Tuple, Union
6
+
7
+ import mmcv
8
+ import numpy as np
9
+ import torch
10
+ from mmengine.config import Config, ConfigDict
11
+ from mmengine.infer.infer import ModelType
12
+ from mmengine.logging import print_log
13
+ from mmengine.model import revert_sync_batchnorm
14
+ from mmengine.registry import init_default_scope
15
+ from mmengine.structures import InstanceData
16
+
17
+ from mmpose.evaluation.functional import nms
18
+ from mmpose.registry import INFERENCERS
19
+ from mmpose.structures import PoseDataSample, merge_data_samples
20
+ from .base_mmpose_inferencer import BaseMMPoseInferencer
21
+
22
+ InstanceList = List[InstanceData]
23
+ InputType = Union[str, np.ndarray]
24
+ InputsType = Union[InputType, Sequence[InputType]]
25
+ PredType = Union[InstanceData, InstanceList]
26
+ ImgType = Union[np.ndarray, Sequence[np.ndarray]]
27
+ ConfigType = Union[Config, ConfigDict]
28
+ ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]
29
+
30
+
31
+ @INFERENCERS.register_module()
32
+ class Hand3DInferencer(BaseMMPoseInferencer):
33
+ """The inferencer for 3D hand pose estimation.
34
+
35
+ Args:
36
+ model (str, optional): Pretrained 3D hand pose estimation algorithm.
37
+ It's the path to the config file or the model name defined in
38
+ metafile. For example, it could be:
39
+
40
+ - model alias, e.g. ``'body'``,
41
+ - config name, e.g. ``'simcc_res50_8xb64-210e_coco-256x192'``,
42
+ - config path
43
+
44
+ Defaults to ``None``.
45
+ weights (str, optional): Path to the checkpoint. If it is not
46
+ specified and "model" is a model name of metafile, the weights
47
+ will be loaded from metafile. Defaults to None.
48
+ device (str, optional): Device to run inference. If None, the
49
+ available device will be automatically used. Defaults to None.
50
+ scope (str, optional): The scope of the model. Defaults to "mmpose".
51
+ det_model (str, optional): Config path or alias of detection model.
52
+ Defaults to None.
53
+ det_weights (str, optional): Path to the checkpoints of detection
54
+ model. Defaults to None.
55
+ det_cat_ids (int or list[int], optional): Category id for
56
+ detection model. Defaults to None.
57
+ """
58
+
59
+ preprocess_kwargs: set = {'bbox_thr', 'nms_thr', 'bboxes'}
60
+ forward_kwargs: set = {'disable_rebase_keypoint'}
61
+ visualize_kwargs: set = {
62
+ 'return_vis',
63
+ 'show',
64
+ 'wait_time',
65
+ 'draw_bbox',
66
+ 'radius',
67
+ 'thickness',
68
+ 'kpt_thr',
69
+ 'vis_out_dir',
70
+ 'num_instances',
71
+ }
72
+ postprocess_kwargs: set = {'pred_out_dir', 'return_datasample'}
73
+
74
+ def __init__(self,
75
+ model: Union[ModelType, str],
76
+ weights: Optional[str] = None,
77
+ device: Optional[str] = None,
78
+ scope: Optional[str] = 'mmpose',
79
+ det_model: Optional[Union[ModelType, str]] = None,
80
+ det_weights: Optional[str] = None,
81
+ det_cat_ids: Optional[Union[int, Tuple]] = None,
82
+ show_progress: bool = False) -> None:
83
+
84
+ init_default_scope(scope)
85
+ super().__init__(
86
+ model=model,
87
+ weights=weights,
88
+ device=device,
89
+ scope=scope,
90
+ show_progress=show_progress)
91
+ self.model = revert_sync_batchnorm(self.model)
92
+
93
+ # assign dataset metainfo to self.visualizer
94
+ self.visualizer.set_dataset_meta(self.model.dataset_meta)
95
+
96
+ # initialize hand detector
97
+ self._init_detector(
98
+ det_model=det_model,
99
+ det_weights=det_weights,
100
+ det_cat_ids=det_cat_ids,
101
+ device=device,
102
+ )
103
+
104
+ self._video_input = False
105
+ self._buffer = defaultdict(list)
106
+
107
+ def preprocess_single(self,
108
+ input: InputType,
109
+ index: int,
110
+ bbox_thr: float = 0.3,
111
+ nms_thr: float = 0.3,
112
+ bboxes: Union[List[List], List[np.ndarray],
113
+ np.ndarray] = []):
114
+ """Process a single input into a model-feedable format.
115
+
116
+ Args:
117
+ input (InputType): Input given by user.
118
+ index (int): index of the input
119
+ bbox_thr (float): threshold for bounding box detection.
120
+ Defaults to 0.3.
121
+ nms_thr (float): IoU threshold for bounding box NMS.
122
+ Defaults to 0.3.
123
+
124
+ Yields:
125
+ Any: Data processed by the ``pipeline`` and ``collate_fn``.
126
+ """
127
+
128
+ if isinstance(input, str):
129
+ data_info = dict(img_path=input)
130
+ else:
131
+ data_info = dict(img=input, img_path=f'{index}.jpg'.rjust(10, '0'))
132
+ data_info.update(self.model.dataset_meta)
133
+
134
+ if self.detector is not None:
135
+ try:
136
+ det_results = self.detector(
137
+ input, return_datasamples=True)['predictions']
138
+ except ValueError:
139
+ print_log(
140
+ 'Support for mmpose and mmdet versions up to 3.1.0 '
141
+ 'will be discontinued in upcoming releases. To '
142
+ 'ensure ongoing compatibility, please upgrade to '
143
+ 'mmdet version 3.2.0 or later.',
144
+ logger='current',
145
+ level=logging.WARNING)
146
+ det_results = self.detector(
147
+ input, return_datasample=True)['predictions']
148
+ pred_instance = det_results[0].pred_instances.cpu().numpy()
149
+ bboxes = np.concatenate(
150
+ (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1)
151
+
152
+ label_mask = np.zeros(len(bboxes), dtype=np.uint8)
153
+ for cat_id in self.det_cat_ids:
154
+ label_mask = np.logical_or(label_mask,
155
+ pred_instance.labels == cat_id)
156
+
157
+ bboxes = bboxes[np.logical_and(label_mask,
158
+ pred_instance.scores > bbox_thr)]
159
+ bboxes = bboxes[nms(bboxes, nms_thr)]
160
+
161
+ data_infos = []
162
+ if len(bboxes) > 0:
163
+ for bbox in bboxes:
164
+ inst = data_info.copy()
165
+ inst['bbox'] = bbox[None, :4]
166
+ inst['bbox_score'] = bbox[4:5]
167
+ data_infos.append(self.pipeline(inst))
168
+ else:
169
+ inst = data_info.copy()
170
+
171
+ # get bbox from the image size
172
+ if isinstance(input, str):
173
+ input = mmcv.imread(input)
174
+ h, w = input.shape[:2]
175
+
176
+ inst['bbox'] = np.array([[0, 0, w, h]], dtype=np.float32)
177
+ inst['bbox_score'] = np.ones(1, dtype=np.float32)
178
+ data_infos.append(self.pipeline(inst))
179
+
180
+ return data_infos
181
+
182
+ @torch.no_grad()
183
+ def forward(self,
184
+ inputs: Union[dict, tuple],
185
+ disable_rebase_keypoint: bool = False):
186
+ """Performs a forward pass through the model.
187
+
188
+ Args:
189
+ inputs (Union[dict, tuple]): The input data to be processed. Can
190
+ be either a dictionary or a tuple.
191
+ disable_rebase_keypoint (bool, optional): Flag to disable rebasing
192
+ the height of the keypoints. Defaults to False.
193
+
194
+ Returns:
195
+ A list of data samples with prediction instances.
196
+ """
197
+ data_samples = self.model.test_step(inputs)
198
+ data_samples_2d = []
199
+
200
+ for idx, res in enumerate(data_samples):
201
+ pred_instances = res.pred_instances
202
+ keypoints = pred_instances.keypoints
203
+ rel_root_depth = pred_instances.rel_root_depth
204
+ scores = pred_instances.keypoint_scores
205
+ hand_type = pred_instances.hand_type
206
+
207
+ res_2d = PoseDataSample()
208
+ gt_instances = res.gt_instances.clone()
209
+ pred_instances = pred_instances.clone()
210
+ res_2d.gt_instances = gt_instances
211
+ res_2d.pred_instances = pred_instances
212
+
213
+ # add relative root depth to left hand joints
214
+ keypoints[:, 21:, 2] += rel_root_depth
215
+
216
+ # set joint scores according to hand type
217
+ scores[:, :21] *= hand_type[:, [0]]
218
+ scores[:, 21:] *= hand_type[:, [1]]
219
+ # normalize kpt score
220
+ if scores.max() > 1:
221
+ scores /= 255
222
+
223
+ res_2d.pred_instances.set_field(keypoints[..., :2].copy(),
224
+ 'keypoints')
225
+
226
+ # rotate the keypoint to make z-axis correspondent to height
227
+ # for better visualization
228
+ vis_R = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
229
+ keypoints[..., :3] = keypoints[..., :3] @ vis_R
230
+
231
+ # rebase height (z-axis)
232
+ if not disable_rebase_keypoint:
233
+ valid = scores > 0
234
+ keypoints[..., 2] -= np.min(
235
+ keypoints[valid, 2], axis=-1, keepdims=True)
236
+
237
+ data_samples[idx].pred_instances.keypoints = keypoints
238
+ data_samples[idx].pred_instances.keypoint_scores = scores
239
+ data_samples_2d.append(res_2d)
240
+
241
+ data_samples = [merge_data_samples(data_samples)]
242
+ data_samples_2d = merge_data_samples(data_samples_2d)
243
+
244
+ self._buffer['pose2d_results'] = data_samples_2d
245
+
246
+ return data_samples
247
+
248
+ def visualize(
249
+ self,
250
+ inputs: list,
251
+ preds: List[PoseDataSample],
252
+ return_vis: bool = False,
253
+ show: bool = False,
254
+ draw_bbox: bool = False,
255
+ wait_time: float = 0,
256
+ radius: int = 3,
257
+ thickness: int = 1,
258
+ kpt_thr: float = 0.3,
259
+ num_instances: int = 1,
260
+ vis_out_dir: str = '',
261
+ window_name: str = '',
262
+ ) -> List[np.ndarray]:
263
+ """Visualize predictions.
264
+
265
+ Args:
266
+ inputs (list): Inputs preprocessed by :meth:`_inputs_to_list`.
267
+ preds (Any): Predictions of the model.
268
+ return_vis (bool): Whether to return images with predicted results.
269
+ show (bool): Whether to display the image in a popup window.
270
+ Defaults to False.
271
+ wait_time (float): The interval of show (ms). Defaults to 0
272
+ draw_bbox (bool): Whether to draw the bounding boxes.
273
+ Defaults to False
274
+ radius (int): Keypoint radius for visualization. Defaults to 3
275
+ thickness (int): Link thickness for visualization. Defaults to 1
276
+ kpt_thr (float): The threshold to visualize the keypoints.
277
+ Defaults to 0.3
278
+ vis_out_dir (str, optional): Directory to save visualization
279
+ results w/o predictions. If left as empty, no file will
280
+ be saved. Defaults to ''.
281
+ window_name (str, optional): Title of display window.
282
+ window_close_event_handler (callable, optional):
283
+
284
+ Returns:
285
+ List[np.ndarray]: Visualization results.
286
+ """
287
+ if (not return_vis) and (not show) and (not vis_out_dir):
288
+ return
289
+
290
+ if getattr(self, 'visualizer', None) is None:
291
+ raise ValueError('Visualization needs the "visualizer" term'
292
+ 'defined in the config, but got None.')
293
+
294
+ self.visualizer.radius = radius
295
+ self.visualizer.line_width = thickness
296
+
297
+ results = []
298
+
299
+ for single_input, pred in zip(inputs, preds):
300
+ if isinstance(single_input, str):
301
+ img = mmcv.imread(single_input, channel_order='rgb')
302
+ elif isinstance(single_input, np.ndarray):
303
+ img = mmcv.bgr2rgb(single_input)
304
+ else:
305
+ raise ValueError('Unsupported input type: '
306
+ f'{type(single_input)}')
307
+ img_name = os.path.basename(pred.metainfo['img_path'])
308
+
309
+ # since visualization and inference utilize the same process,
310
+ # the wait time is reduced when a video input is utilized,
311
+ # thereby eliminating the issue of inference getting stuck.
312
+ wait_time = 1e-5 if self._video_input else wait_time
313
+
314
+ if num_instances < 0:
315
+ num_instances = len(pred.pred_instances)
316
+
317
+ visualization = self.visualizer.add_datasample(
318
+ window_name,
319
+ img,
320
+ data_sample=pred,
321
+ det_data_sample=self._buffer['pose2d_results'],
322
+ draw_gt=False,
323
+ draw_bbox=draw_bbox,
324
+ show=show,
325
+ wait_time=wait_time,
326
+ convert_keypoint=False,
327
+ axis_azimuth=-115,
328
+ axis_limit=200,
329
+ axis_elev=15,
330
+ kpt_thr=kpt_thr,
331
+ num_instances=num_instances)
332
+ results.append(visualization)
333
+
334
+ if vis_out_dir:
335
+ self.save_visualization(
336
+ visualization,
337
+ vis_out_dir,
338
+ img_name=img_name,
339
+ )
340
+
341
+ if return_vis:
342
+ return results
343
+ else:
344
+ return []
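A minimal usage sketch for this inferencer; the 'hand3d' model alias and metafile-based weight resolution are assumptions based on the docstring above, and the paths are placeholders:

# Hypothetical: 3D hand pose estimation on a single image.
from mmpose.apis.inferencers import Hand3DInferencer

inferencer = Hand3DInferencer(model='hand3d', device='cuda:0')
for result in inferencer('path/to/hand.jpg',
                         out_dir='outputs',              # visualizations/ and predictions/
                         disable_rebase_keypoint=False): # keep the z-axis rebasing above
    predictions = result['predictions']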
mmpose/apis/inferencers/mmpose_inferencer.py ADDED
@@ -0,0 +1,250 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import warnings
3
+ from typing import Dict, List, Optional, Sequence, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from mmengine.config import Config, ConfigDict
8
+ from mmengine.infer.infer import ModelType
9
+ from mmengine.structures import InstanceData
10
+ from rich.progress import track
11
+
12
+ from .base_mmpose_inferencer import BaseMMPoseInferencer
13
+ from .hand3d_inferencer import Hand3DInferencer
14
+ from .pose2d_inferencer import Pose2DInferencer
15
+ from .pose3d_inferencer import Pose3DInferencer
16
+
17
+ InstanceList = List[InstanceData]
18
+ InputType = Union[str, np.ndarray]
19
+ InputsType = Union[InputType, Sequence[InputType]]
20
+ PredType = Union[InstanceData, InstanceList]
21
+ ImgType = Union[np.ndarray, Sequence[np.ndarray]]
22
+ ConfigType = Union[Config, ConfigDict]
23
+ ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]
24
+
25
+
26
+ class MMPoseInferencer(BaseMMPoseInferencer):
27
+ """MMPose Inferencer. It's a unified inferencer interface for pose
28
+ estimation tasks, currently including Pose2D, and it can be used to perform
29
+ 2D keypoint detection.
30
+
31
+ Args:
32
+ pose2d (str, optional): Pretrained 2D pose estimation algorithm.
33
+ It's the path to the config file or the model name defined in
34
+ metafile. For example, it could be:
35
+
36
+ - model alias, e.g. ``'body'``,
37
+ - config name, e.g. ``'simcc_res50_8xb64-210e_coco-256x192'``,
38
+ - config path
39
+
40
+ Defaults to ``None``.
41
+ pose2d_weights (str, optional): Path to the custom checkpoint file of
42
+ the selected pose2d model. If it is not specified and "pose2d" is
43
+ a model name of metafile, the weights will be loaded from
44
+ metafile. Defaults to None.
45
+ device (str, optional): Device to run inference. If None, the
46
+ available device will be automatically used. Defaults to None.
47
+ scope (str, optional): The scope of the model. Defaults to "mmpose".
48
+ det_model(str, optional): Config path or alias of detection model.
49
+ Defaults to None.
50
+ det_weights(str, optional): Path to the checkpoints of detection
51
+ model. Defaults to None.
52
+ det_cat_ids(int or list[int], optional): Category id for
53
+ detection model. Defaults to None.
54
+ output_heatmaps (bool, optional): Flag to visualize predicted
55
+ heatmaps. If set to None, the default setting from the model
56
+ config will be used. Default is None.
57
+ """
58
+
59
+ preprocess_kwargs: set = {
60
+ 'bbox_thr', 'nms_thr', 'bboxes', 'use_oks_tracking', 'tracking_thr',
61
+ 'disable_norm_pose_2d'
62
+ }
63
+ forward_kwargs: set = {
64
+ 'merge_results', 'disable_rebase_keypoint', 'pose_based_nms'
65
+ }
66
+ visualize_kwargs: set = {
67
+ 'return_vis', 'show', 'wait_time', 'draw_bbox', 'radius', 'thickness',
68
+ 'kpt_thr', 'vis_out_dir', 'skeleton_style', 'draw_heatmap',
69
+ 'black_background', 'num_instances'
70
+ }
71
+ postprocess_kwargs: set = {'pred_out_dir', 'return_datasample'}
72
+
73
+ def __init__(self,
74
+ pose2d: Optional[str] = None,
75
+ pose2d_weights: Optional[str] = None,
76
+ pose3d: Optional[str] = None,
77
+ pose3d_weights: Optional[str] = None,
78
+ device: Optional[str] = None,
79
+ scope: str = 'mmpose',
80
+ det_model: Optional[Union[ModelType, str]] = None,
81
+ det_weights: Optional[str] = None,
82
+ det_cat_ids: Optional[Union[int, List]] = None,
83
+ show_progress: bool = False) -> None:
84
+
85
+ self.visualizer = None
86
+ self.show_progress = show_progress
87
+ if pose3d is not None:
88
+ if 'hand3d' in pose3d:
89
+ self.inferencer = Hand3DInferencer(pose3d, pose3d_weights,
90
+ device, scope, det_model,
91
+ det_weights, det_cat_ids,
92
+ show_progress)
93
+ else:
94
+ self.inferencer = Pose3DInferencer(pose3d, pose3d_weights,
95
+ pose2d, pose2d_weights,
96
+ device, scope, det_model,
97
+ det_weights, det_cat_ids,
98
+ show_progress)
99
+ elif pose2d is not None:
100
+ self.inferencer = Pose2DInferencer(pose2d, pose2d_weights, device,
101
+ scope, det_model, det_weights,
102
+ det_cat_ids, show_progress)
103
+ else:
104
+ raise ValueError('Either 2d or 3d pose estimation algorithm '
105
+ 'should be provided.')
106
+
107
+ def preprocess(self, inputs: InputsType, batch_size: int = 1, **kwargs):
108
+ """Process the inputs into a model-feedable format.
109
+
110
+ Args:
111
+ inputs (InputsType): Inputs given by user.
112
+ batch_size (int): batch size. Defaults to 1.
113
+
114
+ Yields:
115
+ Any: Data processed by the ``pipeline`` and ``collate_fn``.
116
+ List[str or np.ndarray]: List of original inputs in the batch
117
+ """
118
+ for data in self.inferencer.preprocess(inputs, batch_size, **kwargs):
119
+ yield data
120
+
121
+ @torch.no_grad()
122
+ def forward(self, inputs: InputType, **forward_kwargs) -> PredType:
123
+ """Forward the inputs to the model.
124
+
125
+ Args:
126
+ inputs (InputsType): The inputs to be forwarded.
127
+
128
+ Returns:
129
+ Dict: The prediction results. Possibly with keys "pose2d".
130
+ """
131
+ return self.inferencer.forward(inputs, **forward_kwargs)
132
+
133
+ def __call__(
134
+ self,
135
+ inputs: InputsType,
136
+ return_datasamples: bool = False,
137
+ batch_size: int = 1,
138
+ out_dir: Optional[str] = None,
139
+ **kwargs,
140
+ ) -> dict:
141
+ """Call the inferencer.
142
+
143
+ Args:
144
+ inputs (InputsType): Inputs for the inferencer.
145
+ return_datasamples (bool): Whether to return results as
146
+ :obj:`BaseDataElement`. Defaults to False.
147
+ batch_size (int): Batch size. Defaults to 1.
148
+ out_dir (str, optional): directory to save visualization
149
+ results and predictions. Will be overridden if vis_out_dir or
150
+ pred_out_dir are given. Defaults to None
151
+ **kwargs: Key words arguments passed to :meth:`preprocess`,
152
+ :meth:`forward`, :meth:`visualize` and :meth:`postprocess`.
153
+ Each key in kwargs should be in the corresponding set of
154
+ ``preprocess_kwargs``, ``forward_kwargs``,
155
+ ``visualize_kwargs`` and ``postprocess_kwargs``.
156
+
157
+ Returns:
158
+ dict: Inference and visualization results.
159
+ """
160
+ if out_dir is not None:
161
+ if 'vis_out_dir' not in kwargs:
162
+ kwargs['vis_out_dir'] = f'{out_dir}/visualizations'
163
+ if 'pred_out_dir' not in kwargs:
164
+ kwargs['pred_out_dir'] = f'{out_dir}/predictions'
165
+
166
+ kwargs = {
167
+ key: value
168
+ for key, value in kwargs.items()
169
+ if key in set.union(self.inferencer.preprocess_kwargs,
170
+ self.inferencer.forward_kwargs,
171
+ self.inferencer.visualize_kwargs,
172
+ self.inferencer.postprocess_kwargs)
173
+ }
174
+ (
175
+ preprocess_kwargs,
176
+ forward_kwargs,
177
+ visualize_kwargs,
178
+ postprocess_kwargs,
179
+ ) = self._dispatch_kwargs(**kwargs)
180
+
181
+ self.inferencer.update_model_visualizer_settings(**kwargs)
182
+
183
+ # preprocessing
184
+ if isinstance(inputs, str) and inputs.startswith('webcam'):
185
+ inputs = self.inferencer._get_webcam_inputs(inputs)
186
+ batch_size = 1
187
+ if not visualize_kwargs.get('show', False):
188
+ warnings.warn('The display mode is turned off, but webcam input '
189
+ 'requires it; it will be enabled automatically.')
190
+ visualize_kwargs['show'] = True
191
+ else:
192
+ inputs = self.inferencer._inputs_to_list(inputs)
193
+ self._video_input = self.inferencer._video_input
194
+ if self._video_input:
195
+ self.video_info = self.inferencer.video_info
196
+
197
+ inputs = self.preprocess(
198
+ inputs, batch_size=batch_size, **preprocess_kwargs)
199
+
200
+ # forward
201
+ if 'bbox_thr' in self.inferencer.forward_kwargs:
202
+ forward_kwargs['bbox_thr'] = preprocess_kwargs.get('bbox_thr', -1)
203
+
204
+ preds = []
205
+
206
+ for proc_inputs, ori_inputs in (track(inputs, description='Inference')
207
+ if self.show_progress else inputs):
208
+ preds = self.forward(proc_inputs, **forward_kwargs)
209
+
210
+ visualization = self.visualize(ori_inputs, preds,
211
+ **visualize_kwargs)
212
+ results = self.postprocess(
213
+ preds,
214
+ visualization,
215
+ return_datasamples=return_datasamples,
216
+ **postprocess_kwargs)
217
+ yield results
218
+
219
+ if self._video_input:
220
+ self._finalize_video_processing(
221
+ postprocess_kwargs.get('pred_out_dir', ''))
222
+
223
+ def visualize(self, inputs: InputsType, preds: PredType,
224
+ **kwargs) -> List[np.ndarray]:
225
+ """Visualize predictions.
226
+
227
+ Args:
228
+ inputs (list): Inputs preprocessed by :meth:`_inputs_to_list`.
229
+ preds (Any): Predictions of the model.
230
+ return_vis (bool): Whether to return images with predicted results.
231
+ show (bool): Whether to display the image in a popup window.
232
+ Defaults to False.
233
+ show_interval (int): The interval of show (s). Defaults to 0
234
+ radius (int): Keypoint radius for visualization. Defaults to 3
235
+ thickness (int): Link thickness for visualization. Defaults to 1
236
+ kpt_thr (float): The threshold to visualize the keypoints.
237
+ Defaults to 0.3
238
+ vis_out_dir (str, optional): directory to save visualization
239
+ results w/o predictions. If left as empty, no file will
240
+ be saved. Defaults to ''.
241
+
242
+ Returns:
243
+ List[np.ndarray]: Visualization results.
244
+ """
245
+ window_name = ''
246
+ if self.inferencer._video_input:
247
+ window_name = self.inferencer.video_info['name']
248
+
249
+ return self.inferencer.visualize(
250
+ inputs, preds, window_name=window_name, **kwargs)
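A brief usage sketch for this unified interface; the 'human' alias and the generator-style call follow the public MMPose documentation, and the paths are placeholders:

# Hypothetical: 2D human pose estimation through the unified interface.
from mmpose.apis import MMPoseInferencer

inferencer = MMPoseInferencer(pose2d='human')   # model alias resolved via metafile
result_generator = inferencer('path/to/image.jpg', out_dir='outputs')
result = next(result_generator)                 # one result dict per input
instances = result['predictions'][0]            # instances detected in the image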
mmpose/apis/inferencers/pose2d_inferencer.py ADDED
@@ -0,0 +1,262 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import logging
3
+ from typing import Dict, List, Optional, Sequence, Tuple, Union
4
+
5
+ import mmcv
6
+ import numpy as np
7
+ import torch
8
+ from mmengine.config import Config, ConfigDict
9
+ from mmengine.infer.infer import ModelType
10
+ from mmengine.logging import print_log
11
+ from mmengine.model import revert_sync_batchnorm
12
+ from mmengine.registry import init_default_scope
13
+ from mmengine.structures import InstanceData
14
+
15
+ from mmpose.evaluation.functional import nearby_joints_nms, nms
16
+ from mmpose.registry import INFERENCERS
17
+ from mmpose.structures import merge_data_samples
18
+ from .base_mmpose_inferencer import BaseMMPoseInferencer
19
+
20
+ InstanceList = List[InstanceData]
21
+ InputType = Union[str, np.ndarray]
22
+ InputsType = Union[InputType, Sequence[InputType]]
23
+ PredType = Union[InstanceData, InstanceList]
24
+ ImgType = Union[np.ndarray, Sequence[np.ndarray]]
25
+ ConfigType = Union[Config, ConfigDict]
26
+ ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]
27
+
28
+
29
+ @INFERENCERS.register_module(name='pose-estimation')
30
+ @INFERENCERS.register_module()
31
+ class Pose2DInferencer(BaseMMPoseInferencer):
32
+ """The inferencer for 2D pose estimation.
33
+
34
+ Args:
35
+ model (str, optional): Pretrained 2D pose estimation algorithm.
36
+ It's the path to the config file or the model name defined in
37
+ metafile. For example, it could be:
38
+
39
+ - model alias, e.g. ``'body'``,
40
+ - config name, e.g. ``'simcc_res50_8xb64-210e_coco-256x192'``,
41
+ - config path
42
+
43
+ Defaults to ``None``.
44
+ weights (str, optional): Path to the checkpoint. If it is not
45
+ specified and "model" is a model name of metafile, the weights
46
+ will be loaded from metafile. Defaults to None.
47
+ device (str, optional): Device to run inference. If None, the
48
+ available device will be automatically used. Defaults to None.
49
+ scope (str, optional): The scope of the model. Defaults to "mmpose".
50
+ det_model (str, optional): Config path or alias of detection model.
51
+ Defaults to None.
52
+ det_weights (str, optional): Path to the checkpoints of detection
53
+ model. Defaults to None.
54
+ det_cat_ids (int or list[int], optional): Category id for
55
+ detection model. Defaults to None.
56
+ """
57
+
58
+ preprocess_kwargs: set = {'bbox_thr', 'nms_thr', 'bboxes'}
59
+ forward_kwargs: set = {'merge_results', 'pose_based_nms'}
60
+ visualize_kwargs: set = {
61
+ 'return_vis',
62
+ 'show',
63
+ 'wait_time',
64
+ 'draw_bbox',
65
+ 'radius',
66
+ 'thickness',
67
+ 'kpt_thr',
68
+ 'vis_out_dir',
69
+ 'skeleton_style',
70
+ 'draw_heatmap',
71
+ 'black_background',
72
+ }
73
+ postprocess_kwargs: set = {'pred_out_dir', 'return_datasample'}
74
+
75
+ def __init__(self,
76
+ model: Union[ModelType, str],
77
+ weights: Optional[str] = None,
78
+ device: Optional[str] = None,
79
+ scope: Optional[str] = 'mmpose',
80
+ det_model: Optional[Union[ModelType, str]] = None,
81
+ det_weights: Optional[str] = None,
82
+ det_cat_ids: Optional[Union[int, Tuple]] = None,
83
+ show_progress: bool = False) -> None:
84
+
85
+ init_default_scope(scope)
86
+ super().__init__(
87
+ model=model,
88
+ weights=weights,
89
+ device=device,
90
+ scope=scope,
91
+ show_progress=show_progress)
92
+ self.model = revert_sync_batchnorm(self.model)
93
+
94
+ # assign dataset metainfo to self.visualizer
95
+ self.visualizer.set_dataset_meta(self.model.dataset_meta)
96
+
97
+ # initialize detector for top-down models
98
+ if self.cfg.data_mode == 'topdown':
99
+ self._init_detector(
100
+ det_model=det_model,
101
+ det_weights=det_weights,
102
+ det_cat_ids=det_cat_ids,
103
+ device=device,
104
+ )
105
+
106
+ self._video_input = False
107
+
108
+ def update_model_visualizer_settings(self,
109
+ draw_heatmap: bool = False,
110
+ skeleton_style: str = 'mmpose',
111
+ **kwargs) -> None:
112
+ """Update the settings of models and visualizer according to inference
113
+ arguments.
114
+
115
+ Args:
116
+ draw_heatmaps (bool, optional): Flag to visualize predicted
117
+ heatmaps. If not provided, it defaults to False.
118
+ skeleton_style (str, optional): Skeleton style selection. Valid
119
+ options are 'mmpose' and 'openpose'. Defaults to 'mmpose'.
120
+ """
121
+ self.model.test_cfg['output_heatmaps'] = draw_heatmap
122
+
123
+ if skeleton_style not in ['mmpose', 'openpose']:
124
+ raise ValueError('`skeleton_style` must be either \'mmpose\' '
125
+ 'or \'openpose\'')
126
+
127
+ if skeleton_style == 'openpose':
128
+ self.visualizer.set_dataset_meta(self.model.dataset_meta,
129
+ skeleton_style)
130
+
131
+ def preprocess_single(self,
132
+ input: InputType,
133
+ index: int,
134
+ bbox_thr: float = 0.3,
135
+ nms_thr: float = 0.3,
136
+ bboxes: Union[List[List], List[np.ndarray],
137
+ np.ndarray] = []):
138
+ """Process a single input into a model-feedable format.
139
+
140
+ Args:
141
+ input (InputType): Input given by user.
142
+ index (int): index of the input
143
+ bbox_thr (float): threshold for bounding box detection.
144
+ Defaults to 0.3.
145
+ nms_thr (float): IoU threshold for bounding box NMS.
146
+ Defaults to 0.3.
147
+
148
+ Yields:
149
+ Any: Data processed by the ``pipeline`` and ``collate_fn``.
150
+ """
151
+
152
+ if isinstance(input, str):
153
+ data_info = dict(img_path=input)
154
+ else:
155
+ data_info = dict(img=input, img_path=f'{index}.jpg'.rjust(10, '0'))
156
+ data_info.update(self.model.dataset_meta)
157
+
158
+ if self.cfg.data_mode == 'topdown':
159
+ bboxes = []
160
+ if self.detector is not None:
161
+ try:
162
+ det_results = self.detector(
163
+ input, return_datasamples=True)['predictions']
164
+ except ValueError:
165
+ print_log(
166
+ 'Support for mmpose and mmdet versions up to 3.1.0 '
167
+ 'will be discontinued in upcoming releases. To '
168
+ 'ensure ongoing compatibility, please upgrade to '
169
+ 'mmdet version 3.2.0 or later.',
170
+ logger='current',
171
+ level=logging.WARNING)
172
+ det_results = self.detector(
173
+ input, return_datasample=True)['predictions']
174
+ pred_instance = det_results[0].pred_instances.cpu().numpy()
175
+ bboxes = np.concatenate(
176
+ (pred_instance.bboxes, pred_instance.scores[:, None]),
177
+ axis=1)
178
+
179
+ label_mask = np.zeros(len(bboxes), dtype=np.uint8)
180
+ for cat_id in self.det_cat_ids:
181
+ label_mask = np.logical_or(label_mask,
182
+ pred_instance.labels == cat_id)
183
+
184
+ bboxes = bboxes[np.logical_and(
185
+ label_mask, pred_instance.scores > bbox_thr)]
186
+ bboxes = bboxes[nms(bboxes, nms_thr)]
187
+
188
+ data_infos = []
189
+ if len(bboxes) > 0:
190
+ for bbox in bboxes:
191
+ inst = data_info.copy()
192
+ inst['bbox'] = bbox[None, :4]
193
+ inst['bbox_score'] = bbox[4:5]
194
+ data_infos.append(self.pipeline(inst))
195
+ else:
196
+ inst = data_info.copy()
197
+
198
+ # get bbox from the image size
199
+ if isinstance(input, str):
200
+ input = mmcv.imread(input)
201
+ h, w = input.shape[:2]
202
+
203
+ inst['bbox'] = np.array([[0, 0, w, h]], dtype=np.float32)
204
+ inst['bbox_score'] = np.ones(1, dtype=np.float32)
205
+ data_infos.append(self.pipeline(inst))
206
+
207
+ else: # bottom-up
208
+ data_infos = [self.pipeline(data_info)]
209
+
210
+ return data_infos
211
+
212
+ @torch.no_grad()
213
+ def forward(self,
214
+ inputs: Union[dict, tuple],
215
+ merge_results: bool = True,
216
+ bbox_thr: float = -1,
217
+ pose_based_nms: bool = False):
218
+ """Performs a forward pass through the model.
219
+
220
+ Args:
221
+ inputs (Union[dict, tuple]): The input data to be processed. Can
222
+ be either a dictionary or a tuple.
223
+ merge_results (bool, optional): Whether to merge data samples,
224
+ default to True. This is only applicable when the data_mode
225
+ is 'topdown'.
226
+ bbox_thr (float, optional): A threshold for the bounding box
227
+ scores. Bounding boxes with scores greater than this value
228
+ will be retained. Default value is -1 which retains all
229
+ bounding boxes.
230
+
231
+ Returns:
232
+ A list of data samples with prediction instances.
233
+ """
234
+ data_samples = self.model.test_step(inputs)
235
+ if self.cfg.data_mode == 'topdown' and merge_results:
236
+ data_samples = [merge_data_samples(data_samples)]
237
+
238
+ if bbox_thr > 0:
239
+ for ds in data_samples:
240
+ if 'bbox_scores' in ds.pred_instances:
241
+ ds.pred_instances = ds.pred_instances[
242
+ ds.pred_instances.bbox_scores > bbox_thr]
243
+
244
+ if pose_based_nms:
245
+ for ds in data_samples:
246
+ if len(ds.pred_instances) == 0:
247
+ continue
248
+
249
+ kpts = ds.pred_instances.keypoints
250
+ scores = ds.pred_instances.bbox_scores
251
+ num_keypoints = kpts.shape[-2]
252
+
253
+ kept_indices = nearby_joints_nms(
254
+ [
255
+ dict(keypoints=kpts[i], score=scores[i])
256
+ for i in range(len(kpts))
257
+ ],
258
+ num_nearby_joints_thr=num_keypoints // 3,
259
+ )
260
+ ds.pred_instances = ds.pred_instances[kept_indices]
261
+
262
+ return data_samples
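A minimal usage sketch of the inferencer above, assuming mmpose and mmdet are installed, 'human' is a valid metafile alias, and 'demo.jpg' is a placeholder image path; adjust the thresholds to your data.

from mmpose.apis.inferencers import Pose2DInferencer

# 'human' is resolved through the metafile aliases; a config path works too.
inferencer = Pose2DInferencer(model='human')

# Calling the inferencer returns a generator that yields one result dict per
# input; 'predictions' holds per-instance keypoints, scores and bboxes.
result = next(inferencer('demo.jpg', bbox_thr=0.3, nms_thr=0.3))
print(result['predictions'])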
mmpose/apis/inferencers/pose3d_inferencer.py ADDED
@@ -0,0 +1,457 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import os
3
+ from collections import defaultdict
4
+ from functools import partial
5
+ from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
6
+
7
+ import mmcv
8
+ import numpy as np
9
+ import torch
10
+ from mmengine.config import Config, ConfigDict
11
+ from mmengine.infer.infer import ModelType
12
+ from mmengine.model import revert_sync_batchnorm
13
+ from mmengine.registry import init_default_scope
14
+ from mmengine.structures import InstanceData
15
+
16
+ from mmpose.apis import (_track_by_iou, _track_by_oks, collate_pose_sequence,
17
+ convert_keypoint_definition, extract_pose_sequence)
18
+ from mmpose.registry import INFERENCERS
19
+ from mmpose.structures import PoseDataSample, merge_data_samples
20
+ from .base_mmpose_inferencer import BaseMMPoseInferencer
21
+ from .pose2d_inferencer import Pose2DInferencer
22
+
23
+ InstanceList = List[InstanceData]
24
+ InputType = Union[str, np.ndarray]
25
+ InputsType = Union[InputType, Sequence[InputType]]
26
+ PredType = Union[InstanceData, InstanceList]
27
+ ImgType = Union[np.ndarray, Sequence[np.ndarray]]
28
+ ConfigType = Union[Config, ConfigDict]
29
+ ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]
30
+
31
+
32
+ @INFERENCERS.register_module(name='pose-estimation-3d')
33
+ @INFERENCERS.register_module()
34
+ class Pose3DInferencer(BaseMMPoseInferencer):
35
+ """The inferencer for 3D pose estimation.
36
+
37
+ Args:
38
+ model (str, optional): Pretrained 2D pose estimation algorithm.
39
+ It's the path to the config file or the model name defined in
40
+ metafile. For example, it could be:
41
+
42
+ - model alias, e.g. ``'body'``,
43
+ - config name, e.g. ``'simcc_res50_8xb64-210e_coco-256x192'``,
44
+ - config path
45
+
46
+ Defaults to ``None``.
47
+ weights (str, optional): Path to the checkpoint. If it is not
48
+ specified and "model" is a model name of metafile, the weights
49
+ will be loaded from metafile. Defaults to None.
50
+ device (str, optional): Device to run inference. If None, the
51
+ available device will be automatically used. Defaults to None.
52
+ scope (str, optional): The scope of the model. Defaults to "mmpose".
53
+ det_model (str, optional): Config path or alias of detection model.
54
+ Defaults to None.
55
+ det_weights (str, optional): Path to the checkpoints of detection
56
+ model. Defaults to None.
57
+ det_cat_ids (int or list[int], optional): Category id for
58
+ detection model. Defaults to None.
59
+ output_heatmaps (bool, optional): Flag to visualize predicted
60
+ heatmaps. If set to None, the default setting from the model
61
+ config will be used. Default is None.
62
+ """
63
+
64
+ preprocess_kwargs: set = {
65
+ 'bbox_thr', 'nms_thr', 'bboxes', 'use_oks_tracking', 'tracking_thr',
66
+ 'disable_norm_pose_2d'
67
+ }
68
+ forward_kwargs: set = {'disable_rebase_keypoint'}
69
+ visualize_kwargs: set = {
70
+ 'return_vis',
71
+ 'show',
72
+ 'wait_time',
73
+ 'draw_bbox',
74
+ 'radius',
75
+ 'thickness',
76
+ 'num_instances',
77
+ 'kpt_thr',
78
+ 'vis_out_dir',
79
+ }
80
+ postprocess_kwargs: set = {'pred_out_dir', 'return_datasample'}
81
+
82
+ def __init__(self,
83
+ model: Union[ModelType, str],
84
+ weights: Optional[str] = None,
85
+ pose2d_model: Optional[Union[ModelType, str]] = None,
86
+ pose2d_weights: Optional[str] = None,
87
+ device: Optional[str] = None,
88
+ scope: Optional[str] = 'mmpose',
89
+ det_model: Optional[Union[ModelType, str]] = None,
90
+ det_weights: Optional[str] = None,
91
+ det_cat_ids: Optional[Union[int, Tuple]] = None,
92
+ show_progress: bool = False) -> None:
93
+
94
+ init_default_scope(scope)
95
+ super().__init__(
96
+ model=model,
97
+ weights=weights,
98
+ device=device,
99
+ scope=scope,
100
+ show_progress=show_progress)
101
+ self.model = revert_sync_batchnorm(self.model)
102
+
103
+ # assign dataset metainfo to self.visualizer
104
+ self.visualizer.set_dataset_meta(self.model.dataset_meta)
105
+
106
+ # initialize 2d pose estimator
107
+ self.pose2d_model = Pose2DInferencer(
108
+ pose2d_model if pose2d_model else 'human', pose2d_weights, device,
109
+ scope, det_model, det_weights, det_cat_ids)
110
+
111
+ # helper functions
112
+ self._keypoint_converter = partial(
113
+ convert_keypoint_definition,
114
+ pose_det_dataset=self.pose2d_model.model.
115
+ dataset_meta['dataset_name'],
116
+ pose_lift_dataset=self.model.dataset_meta['dataset_name'],
117
+ )
118
+
119
+ self._pose_seq_extractor = partial(
120
+ extract_pose_sequence,
121
+ causal=self.cfg.test_dataloader.dataset.get('causal', False),
122
+ seq_len=self.cfg.test_dataloader.dataset.get('seq_len', 1),
123
+ step=self.cfg.test_dataloader.dataset.get('seq_step', 1))
124
+
125
+ self._video_input = False
126
+ self._buffer = defaultdict(list)
127
+
128
+ def preprocess_single(self,
129
+ input: InputType,
130
+ index: int,
131
+ bbox_thr: float = 0.3,
132
+ nms_thr: float = 0.3,
133
+ bboxes: Union[List[List], List[np.ndarray],
134
+ np.ndarray] = [],
135
+ use_oks_tracking: bool = False,
136
+ tracking_thr: float = 0.3,
137
+ disable_norm_pose_2d: bool = False):
138
+ """Process a single input into a model-feedable format.
139
+
140
+ Args:
141
+ input (InputType): The input provided by the user.
142
+ index (int): The index of the input.
143
+ bbox_thr (float, optional): The threshold for bounding box
144
+ detection. Defaults to 0.3.
145
+ nms_thr (float, optional): The Intersection over Union (IoU)
146
+ threshold for bounding box Non-Maximum Suppression (NMS).
147
+ Defaults to 0.3.
148
+ bboxes (Union[List[List], List[np.ndarray], np.ndarray]):
149
+ The bounding boxes to use. Defaults to [].
150
+ use_oks_tracking (bool, optional): A flag that indicates
151
+ whether OKS-based tracking should be used. Defaults to False.
152
+ tracking_thr (float, optional): The threshold for tracking.
153
+ Defaults to 0.3.
154
+ disable_norm_pose_2d (bool, optional): A flag that indicates
155
+ whether to disable 2D pose normalization.
156
+ Defaults to False.
157
+
158
+ Yields:
159
+ Any: The data processed by the pipeline and collate_fn.
160
+
161
+ This method first calculates 2D keypoints using the provided
162
+ pose2d_model. The method also performs instance matching, which
163
+ can use either OKS-based tracking or IOU-based tracking.
164
+ """
165
+
166
+ # calculate 2d keypoints
167
+ results_pose2d = next(
168
+ self.pose2d_model(
169
+ input,
170
+ bbox_thr=bbox_thr,
171
+ nms_thr=nms_thr,
172
+ bboxes=bboxes,
173
+ merge_results=False,
174
+ return_datasamples=True))['predictions']
175
+
176
+ for ds in results_pose2d:
177
+ ds.pred_instances.set_field(
178
+ (ds.pred_instances.bboxes[..., 2:] -
179
+ ds.pred_instances.bboxes[..., :2]).prod(-1), 'areas')
180
+
181
+ if not self._video_input:
182
+ height, width = results_pose2d[0].metainfo['ori_shape']
183
+
184
+ # Clear the buffer if inputs are individual images to prevent
185
+ # carryover effects from previous images
186
+ self._buffer.clear()
187
+
188
+ else:
189
+ height = self.video_info['height']
190
+ width = self.video_info['width']
191
+ img_path = results_pose2d[0].metainfo['img_path']
192
+
193
+ # instance matching
194
+ if use_oks_tracking:
195
+ _track = partial(_track_by_oks)
196
+ else:
197
+ _track = _track_by_iou
198
+
199
+ for result in results_pose2d:
200
+ track_id, self._buffer['results_pose2d_last'], _ = _track(
201
+ result, self._buffer['results_pose2d_last'], tracking_thr)
202
+ if track_id == -1:
203
+ pred_instances = result.pred_instances.cpu().numpy()
204
+ keypoints = pred_instances.keypoints
205
+ if np.count_nonzero(keypoints[:, :, 1]) >= 3:
206
+ next_id = self._buffer.get('next_id', 0)
207
+ result.set_field(next_id, 'track_id')
208
+ self._buffer['next_id'] = next_id + 1
209
+ else:
210
+ # If the number of keypoints detected is small,
211
+ # delete that person instance.
212
+ result.pred_instances.keypoints[..., 1] = -10
213
+ result.pred_instances.bboxes *= 0
214
+ result.set_field(-1, 'track_id')
215
+ else:
216
+ result.set_field(track_id, 'track_id')
217
+ self._buffer['pose2d_results'] = merge_data_samples(results_pose2d)
218
+
219
+ # convert keypoints
220
+ results_pose2d_converted = [ds.cpu().numpy() for ds in results_pose2d]
221
+ for ds in results_pose2d_converted:
222
+ ds.pred_instances.keypoints = self._keypoint_converter(
223
+ ds.pred_instances.keypoints)
224
+ self._buffer['pose_est_results_list'].append(results_pose2d_converted)
225
+
226
+ # extract and pad input pose2d sequence
227
+ pose_results_2d = self._pose_seq_extractor(
228
+ self._buffer['pose_est_results_list'],
229
+ frame_idx=index if self._video_input else 0)
230
+ causal = self.cfg.test_dataloader.dataset.get('causal', False)
231
+ target_idx = -1 if causal else len(pose_results_2d) // 2
232
+
233
+ stats_info = self.model.dataset_meta.get('stats_info', {})
234
+ bbox_center = stats_info.get('bbox_center', None)
235
+ bbox_scale = stats_info.get('bbox_scale', None)
236
+
237
+ pose_results_2d_copy = []
238
+ for pose_res in pose_results_2d:
239
+ pose_res_copy = []
240
+ for data_sample in pose_res:
241
+
242
+ data_sample_copy = PoseDataSample()
243
+ data_sample_copy.gt_instances = \
244
+ data_sample.gt_instances.clone()
245
+ data_sample_copy.pred_instances = \
246
+ data_sample.pred_instances.clone()
247
+ data_sample_copy.track_id = data_sample.track_id
248
+
249
+ kpts = data_sample.pred_instances.keypoints
250
+ bboxes = data_sample.pred_instances.bboxes
251
+ keypoints = []
252
+ for k in range(len(kpts)):
253
+ kpt = kpts[k]
254
+ if not disable_norm_pose_2d:
255
+ bbox = bboxes[k]
256
+ center = np.array([[(bbox[0] + bbox[2]) / 2,
257
+ (bbox[1] + bbox[3]) / 2]])
258
+ scale = max(bbox[2] - bbox[0], bbox[3] - bbox[1])
259
+ keypoints.append((kpt[:, :2] - center) / scale *
260
+ bbox_scale + bbox_center)
261
+ else:
262
+ keypoints.append(kpt[:, :2])
263
+ data_sample_copy.pred_instances.set_field(
264
+ np.array(keypoints), 'keypoints')
265
+ pose_res_copy.append(data_sample_copy)
266
+
267
+ pose_results_2d_copy.append(pose_res_copy)
268
+ pose_sequences_2d = collate_pose_sequence(pose_results_2d_copy, True,
269
+ target_idx)
270
+ if not pose_sequences_2d:
271
+ return []
272
+
273
+ data_list = []
274
+ for i, pose_seq in enumerate(pose_sequences_2d):
275
+ data_info = dict()
276
+
277
+ keypoints_2d = pose_seq.pred_instances.keypoints
278
+ keypoints_2d = np.squeeze(
279
+ keypoints_2d,
280
+ axis=0) if keypoints_2d.ndim == 4 else keypoints_2d
281
+
282
+ T, K, C = keypoints_2d.shape
283
+
284
+ data_info['keypoints'] = keypoints_2d
285
+ data_info['keypoints_visible'] = np.ones((
286
+ T,
287
+ K,
288
+ ),
289
+ dtype=np.float32)
290
+ data_info['lifting_target'] = np.zeros((1, K, 3), dtype=np.float32)
291
+ data_info['factor'] = np.zeros((T, ), dtype=np.float32)
292
+ data_info['lifting_target_visible'] = np.ones((1, K, 1),
293
+ dtype=np.float32)
294
+ data_info['camera_param'] = dict(w=width, h=height)
295
+
296
+ data_info.update(self.model.dataset_meta)
297
+ data_info = self.pipeline(data_info)
298
+ data_info['data_samples'].set_field(
299
+ img_path, 'img_path', field_type='metainfo')
300
+ data_list.append(data_info)
301
+
302
+ return data_list
303
+
304
+ @torch.no_grad()
305
+ def forward(self,
306
+ inputs: Union[dict, tuple],
307
+ disable_rebase_keypoint: bool = False):
308
+ """Perform forward pass through the model and process the results.
309
+
310
+ Args:
311
+ inputs (Union[dict, tuple]): The inputs for the model.
312
+ disable_rebase_keypoint (bool, optional): Flag to disable rebasing
313
+ the height of the keypoints. Defaults to False.
314
+
315
+ Returns:
316
+ list: A list of data samples, each containing the model's output
317
+ results.
318
+ """
319
+ pose_lift_results = self.model.test_step(inputs)
320
+
321
+ # Post-processing of pose estimation results
322
+ pose_est_results_converted = self._buffer['pose_est_results_list'][-1]
323
+ for idx, pose_lift_res in enumerate(pose_lift_results):
324
+ # Update track_id from the pose estimation results
325
+ pose_lift_res.track_id = pose_est_results_converted[idx].get(
326
+ 'track_id', 1e4)
327
+
328
+ # align the shape of output keypoints coordinates and scores
329
+ keypoints = pose_lift_res.pred_instances.keypoints
330
+ keypoint_scores = pose_lift_res.pred_instances.keypoint_scores
331
+ if keypoint_scores.ndim == 3:
332
+ pose_lift_results[idx].pred_instances.keypoint_scores = \
333
+ np.squeeze(keypoint_scores, axis=1)
334
+ if keypoints.ndim == 4:
335
+ keypoints = np.squeeze(keypoints, axis=1)
336
+
337
+ # Invert x and z values of the keypoints
338
+ keypoints = keypoints[..., [0, 2, 1]]
339
+ keypoints[..., 0] = -keypoints[..., 0]
340
+ keypoints[..., 2] = -keypoints[..., 2]
341
+
342
+ # If rebase_keypoint_height is True, adjust z-axis values
343
+ if not disable_rebase_keypoint:
344
+ keypoints[..., 2] -= np.min(
345
+ keypoints[..., 2], axis=-1, keepdims=True)
346
+
347
+ pose_lift_results[idx].pred_instances.keypoints = keypoints
348
+
349
+ pose_lift_results = sorted(
350
+ pose_lift_results, key=lambda x: x.get('track_id', 1e4))
351
+
352
+ data_samples = [merge_data_samples(pose_lift_results)]
353
+ return data_samples
354
+
355
+ def visualize(self,
356
+ inputs: list,
357
+ preds: List[PoseDataSample],
358
+ return_vis: bool = False,
359
+ show: bool = False,
360
+ draw_bbox: bool = False,
361
+ wait_time: float = 0,
362
+ radius: int = 3,
363
+ thickness: int = 1,
364
+ kpt_thr: float = 0.3,
365
+ num_instances: int = 1,
366
+ vis_out_dir: str = '',
367
+ window_name: str = '',
368
+ window_close_event_handler: Optional[Callable] = None
369
+ ) -> List[np.ndarray]:
370
+ """Visualize predictions.
371
+
372
+ Args:
373
+ inputs (list): Inputs preprocessed by :meth:`_inputs_to_list`.
374
+ preds (Any): Predictions of the model.
375
+ return_vis (bool): Whether to return images with predicted results.
376
+ show (bool): Whether to display the image in a popup window.
377
+ Defaults to False.
378
+ wait_time (float): The interval of show (ms). Defaults to 0
379
+ draw_bbox (bool): Whether to draw the bounding boxes.
380
+ Defaults to False
381
+ radius (int): Keypoint radius for visualization. Defaults to 3
382
+ thickness (int): Link thickness for visualization. Defaults to 1
383
+ kpt_thr (float): The threshold to visualize the keypoints.
384
+ Defaults to 0.3
385
+ vis_out_dir (str, optional): Directory to save visualization
386
+ results w/o predictions. If left as empty, no file will
387
+ be saved. Defaults to ''.
388
+ window_name (str, optional): Title of display window.
389
+ window_close_event_handler (callable, optional):
390
+
391
+ Returns:
392
+ List[np.ndarray]: Visualization results.
393
+ """
394
+ if (not return_vis) and (not show) and (not vis_out_dir):
395
+ return
396
+
397
+ if getattr(self, 'visualizer', None) is None:
398
+ raise ValueError('Visualization needs the "visualizer" term '
399
+ 'defined in the config, but got None.')
400
+
401
+ self.visualizer.radius = radius
402
+ self.visualizer.line_width = thickness
403
+ det_kpt_color = self.pose2d_model.visualizer.kpt_color
404
+ det_dataset_skeleton = self.pose2d_model.visualizer.skeleton
405
+ det_dataset_link_color = self.pose2d_model.visualizer.link_color
406
+ self.visualizer.det_kpt_color = det_kpt_color
407
+ self.visualizer.det_dataset_skeleton = det_dataset_skeleton
408
+ self.visualizer.det_dataset_link_color = det_dataset_link_color
409
+
410
+ results = []
411
+
412
+ for single_input, pred in zip(inputs, preds):
413
+ if isinstance(single_input, str):
414
+ img = mmcv.imread(single_input, channel_order='rgb')
415
+ elif isinstance(single_input, np.ndarray):
416
+ img = mmcv.bgr2rgb(single_input)
417
+ else:
418
+ raise ValueError('Unsupported input type: '
419
+ f'{type(single_input)}')
420
+
421
+ # since visualization and inference utilize the same process,
422
+ # the wait time is reduced when a video input is utilized,
423
+ # thereby eliminating the issue of inference getting stuck.
424
+ wait_time = 1e-5 if self._video_input else wait_time
425
+
426
+ if num_instances < 0:
427
+ num_instances = len(pred.pred_instances)
428
+
429
+ visualization = self.visualizer.add_datasample(
430
+ window_name,
431
+ img,
432
+ data_sample=pred,
433
+ det_data_sample=self._buffer['pose2d_results'],
434
+ draw_gt=False,
435
+ draw_bbox=draw_bbox,
436
+ show=show,
437
+ wait_time=wait_time,
438
+ dataset_2d=self.pose2d_model.model.
439
+ dataset_meta['dataset_name'],
440
+ dataset_3d=self.model.dataset_meta['dataset_name'],
441
+ kpt_thr=kpt_thr,
442
+ num_instances=num_instances)
443
+ results.append(visualization)
444
+
445
+ if vis_out_dir:
446
+ img_name = os.path.basename(pred.metainfo['img_path']) \
447
+ if 'img_path' in pred.metainfo else None
448
+ self.save_visualization(
449
+ visualization,
450
+ vis_out_dir,
451
+ img_name=img_name,
452
+ )
453
+
454
+ if return_vis:
455
+ return results
456
+ else:
457
+ return []
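A minimal usage sketch for the 3D inferencer, assuming 'human3d' is a registered 2D-to-3D lifting alias in the metafile and 'demo.jpg' is a placeholder image; any lifting config path can be substituted for the alias.

from mmpose.apis.inferencers import Pose3DInferencer

# The 2D estimator falls back to the 'human' alias when pose2d_model is omitted.
inferencer = Pose3DInferencer(model='human3d')

# Yields one result dict per input; predictions contain the lifted (x, y, z)
# keypoints for each tracked instance.
result = next(inferencer('demo.jpg'))
print(result['predictions'])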
mmpose/apis/inferencers/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from .default_det_models import default_det_models
3
+ from .get_model_alias import get_model_aliases
4
+
5
+ __all__ = ['default_det_models', 'get_model_aliases']
mmpose/apis/inferencers/utils/default_det_models.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import os.path as osp
3
+
4
+ from mmengine.config.utils import MODULE2PACKAGE
5
+ from mmengine.utils import get_installed_path
6
+
7
+ mmpose_path = get_installed_path(MODULE2PACKAGE['mmpose'])
8
+
9
+ default_det_models = dict(
10
+ human=dict(
11
+ model=osp.join(
12
+ mmpose_path, '.mim', 'demo/mmdetection_cfg/'
13
+ 'rtmdet_m_640-8xb32_coco-person.py'),
14
+ weights='https://download.openmmlab.com/mmpose/v1/projects/'
15
+ 'rtmposev1/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth',
16
+ cat_ids=(0, )),
17
+ face=dict(
18
+ model=osp.join(mmpose_path, '.mim',
19
+ 'demo/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py'),
20
+ weights='https://download.openmmlab.com/mmpose/mmdet_pretrained/'
21
+ 'yolo-x_8xb8-300e_coco-face_13274d7c.pth',
22
+ cat_ids=(0, )),
23
+ hand=dict(
24
+ model=osp.join(mmpose_path, '.mim', 'demo/mmdetection_cfg/'
25
+ 'rtmdet_nano_320-8xb32_hand.py'),
26
+ weights='https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/'
27
+ 'rtmdet_nano_8xb32-300e_hand-267f9c8f.pth',
28
+ cat_ids=(0, )),
29
+ animal=dict(
30
+ model='rtmdet-m',
31
+ weights=None,
32
+ cat_ids=(15, 16, 17, 18, 19, 20, 21, 22, 23)),
33
+ )
34
+
35
+ default_det_models['body'] = default_det_models['human']
36
+ default_det_models['wholebody'] = default_det_models['human']
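A short sketch of how the mapping above is typically read (the driver code here is hypothetical; the keys and fields come from the file itself):

from mmpose.apis.inferencers.utils import default_det_models

det_info = default_det_models['body']   # 'body' and 'wholebody' share the 'human' entry
print(det_info['model'])                # config path of the RTMDet person detector
print(det_info['weights'])              # download URL of the matching checkpoint
print(det_info['cat_ids'])              # (0,) -> the 'person' category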
mmpose/apis/inferencers/utils/get_model_alias.py ADDED
@@ -0,0 +1,37 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from typing import Dict
3
+
4
+ from mmengine.infer import BaseInferencer
5
+
6
+
7
+ def get_model_aliases(scope: str = 'mmpose') -> Dict[str, str]:
8
+ """Retrieve model aliases and their corresponding configuration names.
9
+
10
+ Args:
11
+ scope (str, optional): The scope for the model aliases. Defaults
12
+ to 'mmpose'.
13
+
14
+ Returns:
15
+ Dict[str, str]: A dictionary containing model aliases as keys and
16
+ their corresponding configuration names as values.
17
+ """
18
+
19
+ # Get a list of model configurations from the metafile
20
+ repo_or_mim_dir = BaseInferencer._get_repo_or_mim_dir(scope)
21
+ model_cfgs = BaseInferencer._get_models_from_metafile(repo_or_mim_dir)
22
+
23
+ model_alias_dict = dict()
24
+ for model_cfg in model_cfgs:
25
+ if 'Alias' in model_cfg:
26
+ if isinstance(model_cfg['Alias'], str):
27
+ model_alias_dict[model_cfg['Alias']] = model_cfg['Name']
28
+ elif isinstance(model_cfg['Alias'], list):
29
+ for alias in model_cfg['Alias']:
30
+ model_alias_dict[alias] = model_cfg['Name']
31
+ else:
32
+ raise ValueError(
33
+ 'encounter an unexpected alias type. Please raise an '
34
+ 'issue at https://github.com/open-mmlab/mmpose/issues '
35
+ 'to announce us')
36
+
37
+ return model_alias_dict
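A quick sketch of querying the alias table (the output depends on the metafiles shipped with the installed mmpose):

from mmpose.apis.inferencers.utils import get_model_aliases

aliases = get_model_aliases('mmpose')
# Maps e.g. 'human' to the config name registered under that alias.
for alias, cfg_name in sorted(aliases.items())[:5]:
    print(alias, '->', cfg_name)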
mmpose/apis/visualization.py ADDED
@@ -0,0 +1,132 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from copy import deepcopy
3
+ from typing import Union
4
+
5
+ import mmcv
6
+ import numpy as np
7
+ from mmengine.structures import InstanceData
8
+
9
+ from mmpose.datasets.datasets.utils import parse_pose_metainfo
10
+ from mmpose.structures import PoseDataSample
11
+ from mmpose.visualization import PoseLocalVisualizer
12
+
13
+ # from posevis import pose_visualization
14
+
15
+ # def visualize(
16
+ # img: Union[np.ndarray, str],
17
+ # keypoints: np.ndarray,
18
+ # keypoint_score: np.ndarray = None,
19
+ # metainfo: Union[str, dict] = None,
20
+ # visualizer: PoseLocalVisualizer = None,
21
+ # show_kpt_idx: bool = False,
22
+ # skeleton_style: str = 'mmpose',
23
+ # show: bool = False,
24
+ # kpt_thr: float = 0.3,
25
+ # ):
26
+ # """Visualize 2d keypoints on an image.
27
+
28
+ # Args:
29
+ # img (str | np.ndarray): The image to be displayed.
30
+ # keypoints (np.ndarray): The keypoint to be displayed.
31
+ # keypoint_score (np.ndarray): The score of each keypoint.
32
+ # metainfo (str | dict): The metainfo of dataset.
33
+ # visualizer (PoseLocalVisualizer): The visualizer.
34
+ # show_kpt_idx (bool): Whether to show the index of keypoints.
35
+ # skeleton_style (str): Skeleton style. Options are 'mmpose' and
36
+ # 'openpose'.
37
+ # show (bool): Whether to show the image.
38
+ # wait_time (int): Value of waitKey param.
39
+ # kpt_thr (float): Keypoint threshold.
40
+ # """
41
+ # kpts = keypoints.reshape(-1, 2)
42
+ # kpts = np.concatenate([kpts, keypoint_score[:, None]], axis=1)
43
+ # kpts[kpts[:, 2] < kpt_thr, :] = 0
44
+ # pose_results = [{
45
+ # 'keypoints': kpts,
46
+ # }]
47
+
48
+ # img = pose_visualization(
49
+ # img,
50
+ # pose_results,
51
+ # format="COCO",
52
+ # greyness=1.0,
53
+ # show_markers=True,
54
+ # show_bones=True,
55
+ # line_type="solid",
56
+ # width_multiplier=1.0,
57
+ # bbox_width_multiplier=1.0,
58
+ # show_bbox=False,
59
+ # differ_individuals=False,
60
+ # )
61
+ # return img
62
+
63
+
64
+ def visualize(
65
+ img: Union[np.ndarray, str],
66
+ keypoints: np.ndarray,
67
+ keypoint_score: np.ndarray = None,
68
+ metainfo: Union[str, dict] = None,
69
+ visualizer: PoseLocalVisualizer = None,
70
+ show_kpt_idx: bool = False,
71
+ skeleton_style: str = 'mmpose',
72
+ show: bool = False,
73
+ kpt_thr: float = 0.3,
74
+ ):
75
+ """Visualize 2d keypoints on an image.
76
+
77
+ Args:
78
+ img (str | np.ndarray): The image to be displayed.
79
+ keypoints (np.ndarray): The keypoint to be displayed.
80
+ keypoint_score (np.ndarray): The score of each keypoint.
81
+ metainfo (str | dict): The metainfo of dataset.
82
+ visualizer (PoseLocalVisualizer): The visualizer.
83
+ show_kpt_idx (bool): Whether to show the index of keypoints.
84
+ skeleton_style (str): Skeleton style. Options are 'mmpose' and
85
+ 'openpose'.
86
+ show (bool): Whether to show the image.
87
+ wait_time (int): Value of waitKey param.
88
+ kpt_thr (float): Keypoint threshold.
89
+ """
90
+ assert skeleton_style in [
91
+ 'mmpose', 'openpose'
92
+ ], ("skeleton_style must be either 'mmpose' or 'openpose'")
93
+
94
+ if visualizer is None:
95
+ visualizer = PoseLocalVisualizer()
96
+ else:
97
+ visualizer = deepcopy(visualizer)
98
+
99
+ if isinstance(metainfo, str):
100
+ metainfo = parse_pose_metainfo(dict(from_file=metainfo))
101
+ elif isinstance(metainfo, dict):
102
+ metainfo = parse_pose_metainfo(metainfo)
103
+
104
+ if metainfo is not None:
105
+ visualizer.set_dataset_meta(metainfo, skeleton_style=skeleton_style)
106
+
107
+ if isinstance(img, str):
108
+ img = mmcv.imread(img, channel_order='rgb')
109
+ elif isinstance(img, np.ndarray):
110
+ img = mmcv.bgr2rgb(img)
111
+
112
+ if keypoint_score is None:
113
+ keypoint_score = np.ones(keypoints.shape[0])
114
+
115
+ tmp_instances = InstanceData()
116
+ tmp_instances.keypoints = keypoints
117
+ tmp_instances.keypoint_score = keypoint_score
118
+
119
+ tmp_datasample = PoseDataSample()
120
+ tmp_datasample.pred_instances = tmp_instances
121
+
122
+ visualizer.add_datasample(
123
+ 'visualization',
124
+ img,
125
+ tmp_datasample,
126
+ show_kpt_idx=show_kpt_idx,
127
+ skeleton_style=skeleton_style,
128
+ show=show,
129
+ wait_time=0,
130
+ kpt_thr=kpt_thr)
131
+
132
+ return visualizer.get_image()
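A hedged usage sketch of visualize(); the image path and metainfo file are placeholders, and the keypoint array is laid out as (num_instances, K, 2) for PoseLocalVisualizer.

import numpy as np
from mmpose.apis.visualization import visualize

keypoints = np.random.rand(1, 17, 2) * 256           # one instance, 17 COCO keypoints
scores = np.ones((1, 17), dtype=np.float32)

canvas = visualize(
    'person.jpg',                                     # placeholder image path
    keypoints,
    keypoint_score=scores,
    metainfo='configs/_base_/datasets/coco.py',       # placeholder metainfo path
    show=False)
print(canvas.shape)                                   # RGB image with the skeleton drawn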
mmpose/codecs/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from .annotation_processors import YOLOXPoseAnnotationProcessor
3
+ from .associative_embedding import AssociativeEmbedding
4
+ from .decoupled_heatmap import DecoupledHeatmap
5
+ from .edpose_label import EDPoseLabel
6
+ from .hand_3d_heatmap import Hand3DHeatmap
7
+ from .image_pose_lifting import ImagePoseLifting
8
+ from .integral_regression_label import IntegralRegressionLabel
9
+ from .megvii_heatmap import MegviiHeatmap
10
+ from .motionbert_label import MotionBERTLabel
11
+ from .msra_heatmap import MSRAHeatmap
12
+ from .regression_label import RegressionLabel
13
+ from .simcc_label import SimCCLabel
14
+ from .spr import SPR
15
+ from .udp_heatmap import UDPHeatmap
16
+ from .video_pose_lifting import VideoPoseLifting
17
+ from .onehot_heatmap import OneHotHeatmap
18
+
19
+ __all__ = [
20
+ 'MSRAHeatmap', 'MegviiHeatmap', 'UDPHeatmap', 'RegressionLabel',
21
+ 'SimCCLabel', 'IntegralRegressionLabel', 'AssociativeEmbedding', 'SPR',
22
+ 'DecoupledHeatmap', 'VideoPoseLifting', 'ImagePoseLifting',
23
+ 'MotionBERTLabel', 'YOLOXPoseAnnotationProcessor', 'EDPoseLabel',
24
+ 'Hand3DHeatmap', 'OneHotHeatmap'
25
+ ]
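All of the codecs exported above register themselves in KEYPOINT_CODECS, so configs usually build them through the registry. A minimal sketch (the MSRAHeatmap parameters are only plausible example values):

from mmpose.registry import KEYPOINT_CODECS

codec = KEYPOINT_CODECS.build(
    dict(
        type='MSRAHeatmap',
        input_size=(192, 256),
        heatmap_size=(48, 64),
        sigma=2))
print(type(codec).__name__)  # MSRAHeatmap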
mmpose/codecs/annotation_processors.py ADDED
@@ -0,0 +1,100 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from typing import Dict, List, Optional, Tuple
3
+
4
+ import numpy as np
5
+
6
+ from mmpose.registry import KEYPOINT_CODECS
7
+ from .base import BaseKeypointCodec
8
+
9
+ INF = 1e6
10
+ NEG_INF = -1e6
11
+
12
+
13
+ class BaseAnnotationProcessor(BaseKeypointCodec):
14
+ """Base class for annotation processors."""
15
+
16
+ def decode(self, *args, **kwargs):
17
+ pass
18
+
19
+
20
+ @KEYPOINT_CODECS.register_module()
21
+ class YOLOXPoseAnnotationProcessor(BaseAnnotationProcessor):
22
+ """Convert dataset annotations to the input format of YOLOX-Pose.
23
+
24
+ This processor expands bounding boxes and converts category IDs to labels.
25
+
26
+ Args:
27
+ extend_bbox (bool, optional): Whether to expand the bounding box
28
+ to include all keypoints. Defaults to False.
29
+ input_size (tuple, optional): The size of the input image for the
30
+ model, formatted as (h, w). This argument is necessary for the
31
+ codec in deployment but is not used indeed.
32
+ """
33
+
34
+ auxiliary_encode_keys = {'category_id', 'bbox'}
35
+ label_mapping_table = dict(
36
+ bbox='bboxes',
37
+ bbox_labels='labels',
38
+ keypoints='keypoints',
39
+ keypoints_visible='keypoints_visible',
40
+ area='areas',
41
+ )
42
+ instance_mapping_table = dict(
43
+ bbox='bboxes',
44
+ bbox_score='bbox_scores',
45
+ keypoints='keypoints',
46
+ keypoints_visible='keypoints_visible',
47
+ # remove 'bbox_scales' in default instance_mapping_table to avoid
48
+ # length mismatch during training with multiple datasets
49
+ )
50
+
51
+ def __init__(self,
52
+ extend_bbox: bool = False,
53
+ input_size: Optional[Tuple] = None):
54
+ super().__init__()
55
+ self.extend_bbox = extend_bbox
56
+
57
+ def encode(self,
58
+ keypoints: Optional[np.ndarray] = None,
59
+ keypoints_visible: Optional[np.ndarray] = None,
60
+ bbox: Optional[np.ndarray] = None,
61
+ category_id: Optional[List[int]] = None
62
+ ) -> Dict[str, np.ndarray]:
63
+ """Encode keypoints, bounding boxes, and category IDs.
64
+
65
+ Args:
66
+ keypoints (np.ndarray, optional): Keypoints array. Defaults
67
+ to None.
68
+ keypoints_visible (np.ndarray, optional): Visibility array for
69
+ keypoints. Defaults to None.
70
+ bbox (np.ndarray, optional): Bounding box array. Defaults to None.
71
+ category_id (List[int], optional): List of category IDs. Defaults
72
+ to None.
73
+
74
+ Returns:
75
+ Dict[str, np.ndarray]: Encoded annotations.
76
+ """
77
+ results = {}
78
+
79
+ if self.extend_bbox and bbox is not None:
80
+ # Handle keypoints visibility
81
+ if keypoints_visible.ndim == 3:
82
+ keypoints_visible = keypoints_visible[..., 0]
83
+
84
+ # Expand bounding box to include keypoints
85
+ kpts_min = keypoints.copy()
86
+ kpts_min[keypoints_visible == 0] = INF
87
+ bbox[..., :2] = np.minimum(bbox[..., :2], kpts_min.min(axis=1))
88
+
89
+ kpts_max = keypoints.copy()
90
+ kpts_max[keypoints_visible == 0] = NEG_INF
91
+ bbox[..., 2:] = np.maximum(bbox[..., 2:], kpts_max.max(axis=1))
92
+
93
+ results['bbox'] = bbox
94
+
95
+ if category_id is not None:
96
+ # Convert category IDs to labels
97
+ bbox_labels = np.array(category_id).astype(np.int8) - 1
98
+ results['bbox_labels'] = bbox_labels
99
+
100
+ return results
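A minimal sketch of the encode step with made-up values, illustrating the bbox expansion and the category-id-to-label conversion described above:

import numpy as np
from mmpose.codecs import YOLOXPoseAnnotationProcessor

processor = YOLOXPoseAnnotationProcessor(extend_bbox=True)
encoded = processor.encode(
    keypoints=np.array([[[12., 20.], [48., 60.]]]),    # (1 instance, 2 keypoints, 2)
    keypoints_visible=np.array([[1., 1.]]),            # both keypoints visible
    bbox=np.array([[15., 25., 40., 55.]]),             # xyxy box that misses both keypoints
    category_id=[1])
print(encoded['bbox'])         # expanded to cover both keypoints: [[12. 20. 48. 60.]]
print(encoded['bbox_labels'])  # [0] (category id 1 becomes label 0)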
mmpose/codecs/associative_embedding.py ADDED
@@ -0,0 +1,522 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from itertools import product
3
+ from typing import Any, List, Optional, Tuple
4
+
5
+ import numpy as np
6
+ import torch
7
+ from munkres import Munkres
8
+ from torch import Tensor
9
+
10
+ from mmpose.registry import KEYPOINT_CODECS
11
+ from mmpose.utils.tensor_utils import to_numpy
12
+ from .base import BaseKeypointCodec
13
+ from .utils import (batch_heatmap_nms, generate_gaussian_heatmaps,
14
+ generate_udp_gaussian_heatmaps, refine_keypoints,
15
+ refine_keypoints_dark_udp)
16
+
17
+
18
+ def _py_max_match(scores):
19
+ """Apply munkres algorithm to get the best match.
20
+
21
+ Args:
22
+ scores(np.ndarray): cost matrix.
23
+
24
+ Returns:
25
+ np.ndarray: best match.
26
+ """
27
+ m = Munkres()
28
+ tmp = m.compute(scores)
29
+ tmp = np.array(tmp).astype(int)
30
+ return tmp
31
+
32
+
33
+ def _group_keypoints_by_tags(vals: np.ndarray,
34
+ tags: np.ndarray,
35
+ locs: np.ndarray,
36
+ keypoint_order: List[int],
37
+ val_thr: float,
38
+ tag_thr: float = 1.0,
39
+ max_groups: Optional[int] = None) -> np.ndarray:
40
+ """Group the keypoints by tags using Munkres algorithm.
41
+
42
+ Note:
43
+
44
+ - keypoint number: K
45
+ - candidate number: M
46
+ - tag dimension: L
47
+ - coordinate dimension: D
48
+ - group number: G
49
+
50
+ Args:
51
+ vals (np.ndarray): The heatmap response values of keypoints in shape
52
+ (K, M)
53
+ tags (np.ndarray): The tags of the keypoint candidates in shape
54
+ (K, M, L)
55
+ locs (np.ndarray): The locations of the keypoint candidates in shape
56
+ (K, M, D)
57
+ keypoint_order (List[int]): The grouping order of the keypoints.
58
+ The groupping usually starts from a keypoints around the head and
59
+ torso, and gruadually moves out to the limbs
60
+ val_thr (float): The threshold of the keypoint response value
61
+ tag_thr (float): The maximum allowed tag distance when matching a
62
+ keypoint to a group. A keypoint with larger tag distance to any
63
+ of the existing groups will initializes a new group
64
+ max_groups (int, optional): The maximum group number. ``None`` means
65
+ no limitation. Defaults to ``None``
66
+
67
+ Returns:
68
+ np.ndarray: grouped keypoints in shape (G, K, D+1), where the last
69
+ dimension is the concatenated keypoint coordinates and scores.
70
+ """
71
+
72
+ tag_k, loc_k, val_k = tags, locs, vals
73
+ K, M, D = locs.shape
74
+ assert vals.shape == tags.shape[:2] == (K, M)
75
+ assert len(keypoint_order) == K
76
+
77
+ default_ = np.zeros((K, 3 + tag_k.shape[2]), dtype=np.float32)
78
+
79
+ joint_dict = {}
80
+ tag_dict = {}
81
+ for i in range(K):
82
+ idx = keypoint_order[i]
83
+
84
+ tags = tag_k[idx]
85
+ joints = np.concatenate((loc_k[idx], val_k[idx, :, None], tags), 1)
86
+ mask = joints[:, 2] > val_thr
87
+ tags = tags[mask] # shape: [M, L]
88
+ joints = joints[mask] # shape: [M, 3 + L], 3: x, y, val
89
+
90
+ if joints.shape[0] == 0:
91
+ continue
92
+
93
+ if i == 0 or len(joint_dict) == 0:
94
+ for tag, joint in zip(tags, joints):
95
+ key = tag[0]
96
+ joint_dict.setdefault(key, np.copy(default_))[idx] = joint
97
+ tag_dict[key] = [tag]
98
+ else:
99
+ # shape: [M]
100
+ grouped_keys = list(joint_dict.keys())
101
+ # shape: [M, L]
102
+ grouped_tags = [np.mean(tag_dict[i], axis=0) for i in grouped_keys]
103
+
104
+ # shape: [M, M, L]
105
+ diff = joints[:, None, 3:] - np.array(grouped_tags)[None, :, :]
106
+ # shape: [M, M]
107
+ diff_normed = np.linalg.norm(diff, ord=2, axis=2)
108
+ diff_saved = np.copy(diff_normed)
109
+ diff_normed = np.round(diff_normed) * 100 - joints[:, 2:3]
110
+
111
+ num_added = diff.shape[0]
112
+ num_grouped = diff.shape[1]
113
+
114
+ if num_added > num_grouped:
115
+ diff_normed = np.concatenate(
116
+ (diff_normed,
117
+ np.zeros((num_added, num_added - num_grouped),
118
+ dtype=np.float32) + 1e10),
119
+ axis=1)
120
+
121
+ pairs = _py_max_match(diff_normed)
122
+ for row, col in pairs:
123
+ if (row < num_added and col < num_grouped
124
+ and diff_saved[row][col] < tag_thr):
125
+ key = grouped_keys[col]
126
+ joint_dict[key][idx] = joints[row]
127
+ tag_dict[key].append(tags[row])
128
+ else:
129
+ key = tags[row][0]
130
+ joint_dict.setdefault(key, np.copy(default_))[idx] = \
131
+ joints[row]
132
+ tag_dict[key] = [tags[row]]
133
+
134
+ joint_dict_keys = list(joint_dict.keys())[:max_groups]
135
+
136
+ if joint_dict_keys:
137
+ results = np.array([joint_dict[i]
138
+ for i in joint_dict_keys]).astype(np.float32)
139
+ results = results[..., :D + 1]
140
+ else:
141
+ results = np.empty((0, K, D + 1), dtype=np.float32)
142
+ return results
143
+
144
+
145
+ @KEYPOINT_CODECS.register_module()
146
+ class AssociativeEmbedding(BaseKeypointCodec):
147
+ """Encode/decode keypoints with the method introduced in "Associative
148
+ Embedding". This is an asymmetric codec, where the keypoints are
149
+ represented as gaussian heatmaps and position indices during encoding, and
150
+ restored from predicted heatmaps and group tags.
151
+
152
+ See the paper `Associative Embedding: End-to-End Learning for Joint
153
+ Detection and Grouping`_ by Newell et al (2017) for details
154
+
155
+ Note:
156
+
157
+ - instance number: N
158
+ - keypoint number: K
159
+ - keypoint dimension: D
160
+ - embedding tag dimension: L
161
+ - image size: [w, h]
162
+ - heatmap size: [W, H]
163
+
164
+ Encoded:
165
+
166
+ - heatmaps (np.ndarray): The generated heatmap in shape (K, H, W)
167
+ where [W, H] is the `heatmap_size`
168
+ - keypoint_indices (np.ndarray): The keypoint position indices in shape
169
+ (N, K, 2). Each keypoint's index is [i, v], where i is the position
170
+ index in the heatmap (:math:`i=y*w+x`) and v is the visibility
171
+ - keypoint_weights (np.ndarray): The target weights in shape (N, K)
172
+
173
+ Args:
174
+ input_size (tuple): Image size in [w, h]
175
+ heatmap_size (tuple): Heatmap size in [W, H]
176
+ sigma (float): The sigma value of the Gaussian heatmap
177
+ use_udp (bool): Whether use unbiased data processing. See
178
+ `UDP (CVPR 2020)`_ for details. Defaults to ``False``
179
+ decode_keypoint_order (List[int]): The grouping order of the
180
+ keypoint indices. The grouping usually starts from keypoints
181
+ around the head and torso, and gradually moves out to the limbs
182
+ decode_keypoint_thr (float): The threshold of keypoint response value
183
+ in heatmaps. Defaults to 0.1
184
+ decode_tag_thr (float): The maximum allowed tag distance when matching
185
+ a keypoint to a group. A keypoint with larger tag distance to any
186
+ of the existing groups will initialize a new group. Defaults to
187
+ 1.0
188
+ decode_nms_kernel (int): The kernel size of the NMS during decoding,
189
+ which should be an odd integer. Defaults to 5
190
+ decode_gaussian_kernel (int): The kernel size of the Gaussian blur
191
+ during decoding, which should be an odd integer. It is only used
192
+ when ``self.use_udp==True``. Defaults to 3
193
+ decode_topk (int): The number top-k candidates of each keypoints that
194
+ will be retrieved from the heatmaps during decoding. Defaults to
195
+ 20
196
+ decode_max_instances (int, optional): The maximum number of instances
197
+ to decode. ``None`` means no limitation to the instance number.
198
+ Defaults to ``None``
199
+
200
+ .. _`Associative Embedding: End-to-End Learning for Joint Detection and
201
+ Grouping`: https://arxiv.org/abs/1611.05424
202
+ .. _`UDP (CVPR 2020)`: https://arxiv.org/abs/1911.07524
203
+ """
204
+
205
+ def __init__(
206
+ self,
207
+ input_size: Tuple[int, int],
208
+ heatmap_size: Tuple[int, int],
209
+ sigma: Optional[float] = None,
210
+ use_udp: bool = False,
211
+ decode_keypoint_order: List[int] = [],
212
+ decode_nms_kernel: int = 5,
213
+ decode_gaussian_kernel: int = 3,
214
+ decode_keypoint_thr: float = 0.1,
215
+ decode_tag_thr: float = 1.0,
216
+ decode_topk: int = 30,
217
+ decode_center_shift=0.0,
218
+ decode_max_instances: Optional[int] = None,
219
+ ) -> None:
220
+ super().__init__()
221
+ self.input_size = input_size
222
+ self.heatmap_size = heatmap_size
223
+ self.use_udp = use_udp
224
+ self.decode_nms_kernel = decode_nms_kernel
225
+ self.decode_gaussian_kernel = decode_gaussian_kernel
226
+ self.decode_keypoint_thr = decode_keypoint_thr
227
+ self.decode_tag_thr = decode_tag_thr
228
+ self.decode_topk = decode_topk
229
+ self.decode_center_shift = decode_center_shift
230
+ self.decode_max_instances = decode_max_instances
231
+ self.decode_keypoint_order = decode_keypoint_order.copy()
232
+
233
+ if self.use_udp:
234
+ self.scale_factor = ((np.array(input_size) - 1) /
235
+ (np.array(heatmap_size) - 1)).astype(
236
+ np.float32)
237
+ else:
238
+ self.scale_factor = (np.array(input_size) /
239
+ heatmap_size).astype(np.float32)
240
+
241
+ if sigma is None:
242
+ sigma = (heatmap_size[0] * heatmap_size[1])**0.5 / 64
243
+ self.sigma = sigma
244
+
245
+ def encode(
246
+ self,
247
+ keypoints: np.ndarray,
248
+ keypoints_visible: Optional[np.ndarray] = None
249
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
250
+ """Encode keypoints into heatmaps and position indices. Note that the
251
+ original keypoint coordinates should be in the input image space.
252
+
253
+ Args:
254
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
255
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
256
+ (N, K)
257
+
258
+ Returns:
259
+ dict:
260
+ - heatmaps (np.ndarray): The generated heatmap in shape
261
+ (K, H, W) where [W, H] is the `heatmap_size`
262
+ - keypoint_indices (np.ndarray): The keypoint position indices
263
+ in shape (N, K, 2). Each keypoint's index is [i, v], where i
264
+ is the position index in the heatmap (:math:`i=y*w+x`) and v
265
+ is the visibility
266
+ - keypoint_weights (np.ndarray): The target weights in shape
267
+ (N, K)
268
+ """
269
+
270
+ if keypoints_visible is None:
271
+ keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
272
+
273
+ # keypoint coordinates in heatmap
274
+ _keypoints = keypoints / self.scale_factor
275
+
276
+ if self.use_udp:
277
+ heatmaps, keypoint_weights = generate_udp_gaussian_heatmaps(
278
+ heatmap_size=self.heatmap_size,
279
+ keypoints=_keypoints,
280
+ keypoints_visible=keypoints_visible,
281
+ sigma=self.sigma)
282
+ else:
283
+ heatmaps, keypoint_weights = generate_gaussian_heatmaps(
284
+ heatmap_size=self.heatmap_size,
285
+ keypoints=_keypoints,
286
+ keypoints_visible=keypoints_visible,
287
+ sigma=self.sigma)
288
+
289
+ keypoint_indices = self._encode_keypoint_indices(
290
+ heatmap_size=self.heatmap_size,
291
+ keypoints=_keypoints,
292
+ keypoints_visible=keypoints_visible)
293
+
294
+ encoded = dict(
295
+ heatmaps=heatmaps,
296
+ keypoint_indices=keypoint_indices,
297
+ keypoint_weights=keypoint_weights)
298
+
299
+ return encoded
300
+
301
+ def _encode_keypoint_indices(self, heatmap_size: Tuple[int, int],
302
+ keypoints: np.ndarray,
303
+ keypoints_visible: np.ndarray) -> np.ndarray:
304
+ w, h = heatmap_size
305
+ N, K, _ = keypoints.shape
306
+ keypoint_indices = np.zeros((N, K, 2), dtype=np.int64)
307
+
308
+ for n, k in product(range(N), range(K)):
309
+ x, y = (keypoints[n, k] + 0.5).astype(np.int64)
310
+ index = y * w + x
311
+ vis = (keypoints_visible[n, k] > 0.5 and 0 <= x < w and 0 <= y < h)
312
+ keypoint_indices[n, k] = [index, vis]
313
+
314
+ return keypoint_indices
315
+
316
+ def decode(self, encoded: Any) -> Tuple[np.ndarray, np.ndarray]:
317
+ raise NotImplementedError()
318
+
319
+ def _get_batch_topk(self, batch_heatmaps: Tensor, batch_tags: Tensor,
320
+ k: int):
321
+ """Get top-k response values from the heatmaps and corresponding tag
322
+ values from the tagging heatmaps.
323
+
324
+ Args:
325
+ batch_heatmaps (Tensor): Keypoint detection heatmaps in shape
326
+ (B, K, H, W)
327
+ batch_tags (Tensor): Tagging heatmaps in shape (B, C, H, W), where
328
+ the tag dim C is 2*K when using flip testing, or K otherwise
329
+ k (int): The number of top responses to get
330
+
331
+ Returns:
332
+ tuple:
333
+ - topk_vals (Tensor): Top-k response values of each heatmap in
334
+ shape (B, K, Topk)
335
+ - topk_tags (Tensor): The corresponding embedding tags of the
336
+ top-k responses, in shape (B, K, Topk, L)
337
+ - topk_locs (Tensor): The location of the top-k responses in each
338
+ heatmap, in shape (B, K, Topk, 2) where last dimension
339
+ represents x and y coordinates
340
+ """
341
+ B, K, H, W = batch_heatmaps.shape
342
+ L = batch_tags.shape[1] // K
343
+
344
+ # shape of topk_val, top_indices: (B, K, TopK)
345
+ topk_vals, topk_indices = batch_heatmaps.flatten(-2, -1).topk(
346
+ k, dim=-1)
347
+
348
+ topk_tags_per_kpts = [
349
+ torch.gather(_tag, dim=2, index=topk_indices)
350
+ for _tag in torch.unbind(batch_tags.view(B, L, K, H * W), dim=1)
351
+ ]
352
+
353
+ topk_tags = torch.stack(topk_tags_per_kpts, dim=-1) # (B, K, TopK, L)
354
+ topk_locs = torch.stack([topk_indices % W, topk_indices // W],
355
+ dim=-1) # (B, K, TopK, 2)
356
+
357
+ return topk_vals, topk_tags, topk_locs
358
+
359
+ def _group_keypoints(self, batch_vals: np.ndarray, batch_tags: np.ndarray,
360
+ batch_locs: np.ndarray):
361
+ """Group keypoints into groups (each represents an instance) by tags.
362
+
363
+ Args:
364
+ batch_vals (Tensor): Heatmap response values of keypoint
365
+ candidates in shape (B, K, Topk)
366
+ batch_tags (Tensor): Tags of keypoint candidates in shape
367
+ (B, K, Topk, L)
368
+ batch_locs (Tensor): Locations of keypoint candidates in shape
369
+ (B, K, Topk, 2)
370
+
371
+ Returns:
372
+ List[np.ndarray]: Grouping results of a batch, each element is a
373
+ np.ndarray (in shape [N, K, D+1]) that contains the groups
374
+ detected in an image, including both keypoint coordinates and
375
+ scores.
376
+ """
377
+
378
+ def _group_func(inputs: Tuple):
379
+ vals, tags, locs = inputs
380
+ return _group_keypoints_by_tags(
381
+ vals,
382
+ tags,
383
+ locs,
384
+ keypoint_order=self.decode_keypoint_order,
385
+ val_thr=self.decode_keypoint_thr,
386
+ tag_thr=self.decode_tag_thr,
387
+ max_groups=self.decode_max_instances)
388
+
389
+ _results = map(_group_func, zip(batch_vals, batch_tags, batch_locs))
390
+ results = list(_results)
391
+ return results
392
+
393
+ def _fill_missing_keypoints(self, keypoints: np.ndarray,
394
+ keypoint_scores: np.ndarray,
395
+ heatmaps: np.ndarray, tags: np.ndarray):
396
+ """Fill the missing keypoints in the initial predictions.
397
+
398
+ Args:
399
+ keypoints (np.ndarray): Keypoint predictions in shape (N, K, D)
400
+ keypoint_scores (np.ndarray): Keypoint score predictions in shape
401
+ (N, K), in which 0 means the corresponding keypoint is
402
+ missing in the initial prediction
403
+ heatmaps (np.ndarray): Heatmaps in shape (K, H, W)
404
+ tags (np.ndarray): Tagging heatmaps in shape (C, H, W) where
405
+ C=L*K
406
+
407
+ Returns:
408
+ tuple:
409
+ - keypoints (np.ndarray): Keypoint predictions with missing
410
+ ones filled
411
+ - keypoint_scores (np.ndarray): Keypoint score predictions with
412
+ missing ones filled
413
+ """
414
+
415
+ N, K = keypoints.shape[:2]
416
+ H, W = heatmaps.shape[1:]
417
+ L = tags.shape[0] // K
418
+ keypoint_tags = [tags[k::K] for k in range(K)]
419
+
420
+ for n in range(N):
421
+ # Calculate the instance tag (mean tag of detected keypoints)
422
+ _tag = []
423
+ for k in range(K):
424
+ if keypoint_scores[n, k] > 0:
425
+ x, y = keypoints[n, k, :2].astype(np.int64)
426
+ x = np.clip(x, 0, W - 1)
427
+ y = np.clip(y, 0, H - 1)
428
+ _tag.append(keypoint_tags[k][:, y, x])
429
+
430
+ tag = np.mean(_tag, axis=0)
431
+ tag = tag.reshape(L, 1, 1)
432
+ # Search maximum response of the missing keypoints
433
+ for k in range(K):
434
+ if keypoint_scores[n, k] > 0:
435
+ continue
436
+ dist_map = np.linalg.norm(
437
+ keypoint_tags[k] - tag, ord=2, axis=0)
438
+ cost_map = np.round(dist_map) * 100 - heatmaps[k] # H, W
439
+ y, x = np.unravel_index(np.argmin(cost_map), shape=(H, W))
440
+ keypoints[n, k] = [x, y]
441
+ keypoint_scores[n, k] = heatmaps[k, y, x]
442
+
443
+ return keypoints, keypoint_scores
444
+
445
+ def batch_decode(self, batch_heatmaps: Tensor, batch_tags: Tensor
446
+ ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
447
+ """Decode the keypoint coordinates from a batch of heatmaps and tagging
448
+ heatmaps. The decoded keypoint coordinates are in the input image
449
+ space.
450
+
451
+ Args:
452
+ batch_heatmaps (Tensor): Keypoint detection heatmaps in shape
453
+ (B, K, H, W)
454
+ batch_tags (Tensor): Tagging heatmaps in shape (B, C, H, W), where
455
+ :math:`C=L*K`
456
+
457
+ Returns:
458
+ tuple:
459
+ - batch_keypoints (List[np.ndarray]): Decoded keypoint coordinates
460
+ of the batch, each is in shape (N, K, D)
461
+ - batch_scores (List[np.ndarray]): Decoded keypoint scores of the
462
+ batch, each is in shape (N, K). It usually represents the
463
+ confidence of the keypoint prediction
464
+ """
465
+ B, _, H, W = batch_heatmaps.shape
466
+ assert batch_tags.shape[0] == B and batch_tags.shape[2:4] == (H, W), (
467
+ f'Mismatched shapes of heatmap ({batch_heatmaps.shape}) and '
468
+ f'tagging map ({batch_tags.shape})')
469
+
470
+ # Heatmap NMS
471
+ batch_heatmaps_peak = batch_heatmap_nms(batch_heatmaps,
472
+ self.decode_nms_kernel)
473
+
474
+ # Get top-k in each heatmap and convert to numpy
475
+ batch_topk_vals, batch_topk_tags, batch_topk_locs = to_numpy(
476
+ self._get_batch_topk(
477
+ batch_heatmaps_peak, batch_tags, k=self.decode_topk))
478
+
479
+ # Group keypoint candidates into groups (instances)
480
+ batch_groups = self._group_keypoints(batch_topk_vals, batch_topk_tags,
481
+ batch_topk_locs)
482
+
483
+ # Convert to numpy
484
+ batch_heatmaps_np = to_numpy(batch_heatmaps)
485
+ batch_tags_np = to_numpy(batch_tags)
486
+
487
+ # Refine the keypoint prediction
488
+ batch_keypoints = []
489
+ batch_keypoint_scores = []
490
+ batch_instance_scores = []
491
+ for i, (groups, heatmaps, tags) in enumerate(
492
+ zip(batch_groups, batch_heatmaps_np, batch_tags_np)):
493
+
494
+ keypoints, scores = groups[..., :-1], groups[..., -1]
495
+ instance_scores = scores.mean(axis=-1)
496
+
497
+ if keypoints.size > 0:
498
+ # refine keypoint coordinates according to heatmap distribution
499
+ if self.use_udp:
500
+ keypoints = refine_keypoints_dark_udp(
501
+ keypoints,
502
+ heatmaps,
503
+ blur_kernel_size=self.decode_gaussian_kernel)
504
+ else:
505
+ keypoints = refine_keypoints(keypoints, heatmaps)
506
+ keypoints += self.decode_center_shift * \
507
+ (scores > 0).astype(keypoints.dtype)[..., None]
508
+
509
+ # identify missing keypoints
510
+ keypoints, scores = self._fill_missing_keypoints(
511
+ keypoints, scores, heatmaps, tags)
512
+
513
+ batch_keypoints.append(keypoints)
514
+ batch_keypoint_scores.append(scores)
515
+ batch_instance_scores.append(instance_scores)
516
+
517
+ # restore keypoint scale
518
+ batch_keypoints = [
519
+ kpts * self.scale_factor for kpts in batch_keypoints
520
+ ]
521
+
522
+ return batch_keypoints, batch_keypoint_scores, batch_instance_scores
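A minimal encoding sketch for AssociativeEmbedding with toy values; the 17-keypoint COCO layout and the identity keypoint order are assumptions used only for illustration:

import numpy as np
from mmpose.codecs import AssociativeEmbedding

codec = AssociativeEmbedding(
    input_size=(512, 512),
    heatmap_size=(128, 128),
    decode_keypoint_order=list(range(17)))

keypoints = np.random.rand(2, 17, 2) * 512   # 2 instances in input-image coordinates
encoded = codec.encode(keypoints)
print(encoded['heatmaps'].shape)             # (17, 128, 128)
print(encoded['keypoint_indices'].shape)     # (2, 17, 2)
print(encoded['keypoint_weights'].shape)     # (2, 17)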
mmpose/codecs/base.py ADDED
@@ -0,0 +1,81 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from abc import ABCMeta, abstractmethod
3
+ from typing import Any, List, Optional, Tuple
4
+
5
+ import numpy as np
6
+ from mmengine.utils import is_method_overridden
7
+
8
+
9
+ class BaseKeypointCodec(metaclass=ABCMeta):
10
+ """The base class of the keypoint codec.
11
+
12
+ A keypoint codec is a module to encode keypoint coordinates to specific
13
+ representation (e.g. heatmap) and vice versa. A subclass should implement
14
+ the methods :meth:`encode` and :meth:`decode`.
15
+ """
16
+
17
+ # pass additional encoding arguments to the `encode` method, beyond the
18
+ # mandatory `keypoints` and `keypoints_visible` arguments.
19
+ auxiliary_encode_keys = set()
20
+
21
+ field_mapping_table = dict()
22
+ instance_mapping_table = dict()
23
+ label_mapping_table = dict()
24
+
25
+ @abstractmethod
26
+ def encode(self,
27
+ keypoints: np.ndarray,
28
+ keypoints_visible: Optional[np.ndarray] = None) -> dict:
29
+ """Encode keypoints.
30
+
31
+ Note:
32
+
33
+ - instance number: N
34
+ - keypoint number: K
35
+ - keypoint dimension: D
36
+
37
+ Args:
38
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
39
+ keypoints_visible (np.ndarray): Keypoint visibility in shape
40
+ (N, K, D)
41
+
42
+ Returns:
43
+ dict: Encoded items.
44
+ """
45
+
46
+ @abstractmethod
47
+ def decode(self, encoded: Any) -> Tuple[np.ndarray, np.ndarray]:
48
+ """Decode keypoints.
49
+
50
+ Args:
51
+ encoded (any): Encoded keypoint representation using the codec
52
+
53
+ Returns:
54
+ tuple:
55
+ - keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
56
+ - keypoints_visible (np.ndarray): Keypoint visibility in shape
57
+ (N, K, D)
58
+ """
59
+
60
+ def batch_decode(self, batch_encoded: Any
61
+ ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
62
+ """Decode keypoints.
63
+
64
+ Args:
65
+ batch_encoded (any): A batch of encoded keypoint
66
+ representations
67
+
68
+ Returns:
69
+ tuple:
70
+ - batch_keypoints (List[np.ndarray]): Each element is keypoint
71
+ coordinates in shape (N, K, D)
72
+ - batch_keypoints_visible (List[np.ndarray]): Each element is keypoint
73
+ visibility in shape (N, K)
74
+ """
75
+ raise NotImplementedError()
76
+
77
+ @property
78
+ def support_batch_decoding(self) -> bool:
79
+ """Return whether the codec support decoding from batch data."""
80
+ return is_method_overridden('batch_decode', BaseKeypointCodec,
81
+ self.__class__)
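A minimal sketch, not part of the commit, of how a subclass of BaseKeypointCodec is expected to look; the class name and pass-through behaviour are purely illustrative, and a real codec would additionally be registered with @KEYPOINT_CODECS.register_module():

import numpy as np

from mmpose.codecs.base import BaseKeypointCodec


class IdentityCodec(BaseKeypointCodec):
    """Toy codec that passes keypoints through unchanged (illustration only)."""

    def encode(self, keypoints, keypoints_visible=None):
        if keypoints_visible is None:
            keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
        return dict(keypoint_labels=keypoints.copy(),
                    keypoint_weights=keypoints_visible)

    def decode(self, encoded):
        scores = np.ones(encoded.shape[:-1], dtype=np.float32)
        return encoded.copy(), scores


codec = IdentityCodec()
print(codec.support_batch_decoding)  # False: batch_decode is not overridden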
mmpose/codecs/decoupled_heatmap.py ADDED
@@ -0,0 +1,274 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import random
3
+ from typing import Optional, Tuple
4
+
5
+ import numpy as np
6
+
7
+ from mmpose.registry import KEYPOINT_CODECS
8
+ from .base import BaseKeypointCodec
9
+ from .utils import (generate_gaussian_heatmaps, get_diagonal_lengths,
10
+ get_instance_bbox, get_instance_root)
11
+ from .utils.post_processing import get_heatmap_maximum
12
+ from .utils.refinement import refine_keypoints
13
+
14
+
15
+ @KEYPOINT_CODECS.register_module()
16
+ class DecoupledHeatmap(BaseKeypointCodec):
17
+ """Encode/decode keypoints with the method introduced in the paper CID.
18
+
19
+ See the paper `Contextual Instance Decoupling for Robust Multi-Person
20
+ Pose Estimation`_ by Wang et al (2022) for details
21
+
22
+ Note:
23
+
24
+ - instance number: N
25
+ - keypoint number: K
26
+ - keypoint dimension: D
27
+ - image size: [w, h]
28
+ - heatmap size: [W, H]
29
+
30
+ Encoded:
31
+ - heatmaps (np.ndarray): The coupled heatmap in shape
32
+ (1+K, H, W) where [W, H] is the `heatmap_size`.
33
+ - instance_heatmaps (np.ndarray): The decoupled heatmap in shape
34
+ (M*K, H, W) where M is the number of instances.
35
+ - keypoint_weights (np.ndarray): The weight for heatmaps in shape
36
+ (M*K).
37
+ - instance_coords (np.ndarray): The coordinates of instance roots
38
+ in shape (M, 2)
39
+
40
+ Args:
41
+ input_size (tuple): Image size in [w, h]
42
+ heatmap_size (tuple): Heatmap size in [W, H]
43
+ root_type (str): The method to generate the instance root. Options
44
+ are:
45
+
46
+ - ``'kpt_center'``: Average coordinate of all visible keypoints.
47
+ - ``'bbox_center'``: Center point of bounding boxes outlined by
48
+ all visible keypoints.
49
+
50
+ Defaults to ``'kpt_center'``
51
+
52
+ heatmap_min_overlap (float): Minimum overlap rate among instances.
53
+ Used when calculating sigmas for instances. Defaults to 0.7
54
+ background_weight (float): Loss weight of background pixels.
55
+ Defaults to 0.1
56
+ encode_max_instances (int): The maximum number of instances
57
+ to encode for each sample. Defaults to 30
58
+
59
+ .. _`CID`: https://openaccess.thecvf.com/content/CVPR2022/html/Wang_
60
+ Contextual_Instance_Decoupling_for_Robust_Multi-Person_Pose_Estimation_
61
+ CVPR_2022_paper.html
62
+ """
63
+
64
+ # DecoupledHeatmap requires bounding boxes to determine the size of each
65
+ # instance, so that it can assign varying sigmas based on their size
66
+ auxiliary_encode_keys = {'bbox'}
67
+
68
+ label_mapping_table = dict(
69
+ keypoint_weights='keypoint_weights',
70
+ instance_coords='instance_coords',
71
+ )
72
+ field_mapping_table = dict(
73
+ heatmaps='heatmaps',
74
+ instance_heatmaps='instance_heatmaps',
75
+ )
76
+
77
+ def __init__(
78
+ self,
79
+ input_size: Tuple[int, int],
80
+ heatmap_size: Tuple[int, int],
81
+ root_type: str = 'kpt_center',
82
+ heatmap_min_overlap: float = 0.7,
83
+ encode_max_instances: int = 30,
84
+ ):
85
+ super().__init__()
86
+
87
+ self.input_size = input_size
88
+ self.heatmap_size = heatmap_size
89
+ self.root_type = root_type
90
+ self.encode_max_instances = encode_max_instances
91
+ self.heatmap_min_overlap = heatmap_min_overlap
92
+
93
+ self.scale_factor = (np.array(input_size) /
94
+ heatmap_size).astype(np.float32)
95
+
96
+ def _get_instance_wise_sigmas(
97
+ self,
98
+ bbox: np.ndarray,
99
+ ) -> np.ndarray:
100
+ """Get sigma values for each instance according to their size.
101
+
102
+ Args:
103
+ bbox (np.ndarray): Bounding box in shape (N, 4, 2)
104
+
105
+ Returns:
106
+ np.ndarray: Array containing the sigma values for each instance.
107
+ """
108
+ sigmas = np.zeros((bbox.shape[0], ), dtype=np.float32)
109
+
110
+ heights = np.sqrt(np.power(bbox[:, 0] - bbox[:, 1], 2).sum(axis=-1))
111
+ widths = np.sqrt(np.power(bbox[:, 0] - bbox[:, 2], 2).sum(axis=-1))
112
+
113
+ for i in range(bbox.shape[0]):
114
+ h, w = heights[i], widths[i]
115
+
116
+ # compute sigma for each instance
117
+ # condition 1
118
+ a1, b1 = 1, h + w
119
+ c1 = w * h * (1 - self.heatmap_min_overlap) / (
120
+ 1 + self.heatmap_min_overlap)
121
+ sq1 = np.sqrt(b1**2 - 4 * a1 * c1)
122
+ r1 = (b1 + sq1) / 2
123
+
124
+ # condition 2
125
+ a2 = 4
126
+ b2 = 2 * (h + w)
127
+ c2 = (1 - self.heatmap_min_overlap) * w * h
128
+ sq2 = np.sqrt(b2**2 - 4 * a2 * c2)
129
+ r2 = (b2 + sq2) / 2
130
+
131
+ # condition 3
132
+ a3 = 4 * self.heatmap_min_overlap
133
+ b3 = -2 * self.heatmap_min_overlap * (h + w)
134
+ c3 = (self.heatmap_min_overlap - 1) * w * h
135
+ sq3 = np.sqrt(b3**2 - 4 * a3 * c3)
136
+ r3 = (b3 + sq3) / 2
137
+
138
+ sigmas[i] = min(r1, r2, r3) / 3
139
+
140
+ return sigmas
141
+
142
+ def encode(self,
143
+ keypoints: np.ndarray,
144
+ keypoints_visible: Optional[np.ndarray] = None,
145
+ bbox: Optional[np.ndarray] = None) -> dict:
146
+ """Encode keypoints into heatmaps.
147
+
148
+ Args:
149
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
150
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
151
+ (N, K)
152
+ bbox (np.ndarray): Bounding box in shape (N, 8) which includes
153
+ coordinates of 4 corners.
154
+
155
+ Returns:
156
+ dict:
157
+ - heatmaps (np.ndarray): The coupled heatmap in shape
158
+ (1+K, H, W) where [W, H] is the `heatmap_size`.
159
+ - instance_heatmaps (np.ndarray): The decoupled heatmap in shape
160
+ (N*K, H, W) where M is the number of instances.
161
+ - keypoint_weights (np.ndarray): The weight for heatmaps in shape
162
+ (N*K).
163
+ - instance_coords (np.ndarray): The coordinates of instance roots
164
+ in shape (N, 2)
165
+ """
166
+
167
+ if keypoints_visible is None:
168
+ keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
169
+ if bbox is None:
170
+ # generate pseudo bbox via visible keypoints
171
+ bbox = get_instance_bbox(keypoints, keypoints_visible)
172
+ bbox = np.tile(bbox, 2).reshape(-1, 4, 2)
173
+ # corner order: left_top, left_bottom, right_top, right_bottom
174
+ bbox[:, 1:3, 0] = bbox[:, 0:2, 0]
175
+
176
+ # keypoint coordinates in heatmap
177
+ _keypoints = keypoints / self.scale_factor
178
+ _bbox = bbox.reshape(-1, 4, 2) / self.scale_factor
179
+
180
+ # compute the root and scale of each instance
181
+ roots, roots_visible = get_instance_root(_keypoints, keypoints_visible,
182
+ self.root_type)
183
+
184
+ sigmas = self._get_instance_wise_sigmas(_bbox)
185
+
186
+ # generate global heatmaps
187
+ heatmaps, keypoint_weights = generate_gaussian_heatmaps(
188
+ heatmap_size=self.heatmap_size,
189
+ keypoints=np.concatenate((_keypoints, roots[:, None]), axis=1),
190
+ keypoints_visible=np.concatenate(
191
+ (keypoints_visible, roots_visible[:, None]), axis=1),
192
+ sigma=sigmas)
193
+ roots_visible = keypoint_weights[:, -1]
194
+
195
+ # select instances
196
+ inst_roots, inst_indices = [], []
197
+ diagonal_lengths = get_diagonal_lengths(_keypoints, keypoints_visible)
198
+ for i in np.argsort(diagonal_lengths):
199
+ if roots_visible[i] < 1:
200
+ continue
201
+ # rand root point in 3x3 grid
202
+ x, y = roots[i] + np.random.randint(-1, 2, (2, ))
203
+ x = max(0, min(x, self.heatmap_size[0] - 1))
204
+ y = max(0, min(y, self.heatmap_size[1] - 1))
205
+ if (x, y) not in inst_roots:
206
+ inst_roots.append((x, y))
207
+ inst_indices.append(i)
208
+ if len(inst_indices) > self.encode_max_instances:
209
+ rand_indices = random.sample(
210
+ range(len(inst_indices)), self.encode_max_instances)
211
+ inst_roots = [inst_roots[i] for i in rand_indices]
212
+ inst_indices = [inst_indices[i] for i in rand_indices]
213
+
214
+ # generate instance-wise heatmaps
215
+ inst_heatmaps, inst_heatmap_weights = [], []
216
+ for i in inst_indices:
217
+ inst_heatmap, inst_heatmap_weight = generate_gaussian_heatmaps(
218
+ heatmap_size=self.heatmap_size,
219
+ keypoints=_keypoints[i:i + 1],
220
+ keypoints_visible=keypoints_visible[i:i + 1],
221
+ sigma=sigmas[i].item())
222
+ inst_heatmaps.append(inst_heatmap)
223
+ inst_heatmap_weights.append(inst_heatmap_weight)
224
+
225
+ if len(inst_indices) > 0:
226
+ inst_heatmaps = np.concatenate(inst_heatmaps)
227
+ inst_heatmap_weights = np.concatenate(inst_heatmap_weights)
228
+ inst_roots = np.array(inst_roots, dtype=np.int32)
229
+ else:
230
+ inst_heatmaps = np.empty((0, *self.heatmap_size[::-1]))
231
+ inst_heatmap_weights = np.empty((0, ))
232
+ inst_roots = np.empty((0, 2), dtype=np.int32)
233
+
234
+ encoded = dict(
235
+ heatmaps=heatmaps,
236
+ instance_heatmaps=inst_heatmaps,
237
+ keypoint_weights=inst_heatmap_weights,
238
+ instance_coords=inst_roots)
239
+
240
+ return encoded
241
+
242
+ def decode(self, instance_heatmaps: np.ndarray,
243
+ instance_scores: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
244
+ """Decode keypoint coordinates from decoupled heatmaps. The decoded
245
+ keypoint coordinates are in the input image space.
246
+
247
+ Args:
248
+ instance_heatmaps (np.ndarray): Heatmaps in shape (N, K, H, W)
249
+ instance_scores (np.ndarray): Confidence of instance roots
250
+ prediction in shape (N, 1)
251
+
252
+ Returns:
253
+ tuple:
254
+ - keypoints (np.ndarray): Decoded keypoint coordinates in shape
255
+ (N, K, D)
256
+ - scores (np.ndarray): The keypoint scores in shape (N, K). It
257
+ usually represents the confidence of the keypoint prediction
258
+ """
259
+ keypoints, keypoint_scores = [], []
260
+
261
+ for i in range(instance_heatmaps.shape[0]):
262
+ heatmaps = instance_heatmaps[i].copy()
263
+ kpts, scores = get_heatmap_maximum(heatmaps)
264
+ keypoints.append(refine_keypoints(kpts[None], heatmaps))
265
+ keypoint_scores.append(scores[None])
266
+
267
+ keypoints = np.concatenate(keypoints)
268
+ # Restore the keypoint scale
269
+ keypoints = keypoints * self.scale_factor
270
+
271
+ keypoint_scores = np.concatenate(keypoint_scores)
272
+ keypoint_scores *= instance_scores
273
+
274
+ return keypoints, keypoint_scores
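A usage sketch for the codec above, not part of the commit; it assumes DecoupledHeatmap is exported from mmpose.codecs as in upstream MMPose, and the sizes and keypoints are arbitrary:

import numpy as np

from mmpose.codecs import DecoupledHeatmap

codec = DecoupledHeatmap(input_size=(512, 512), heatmap_size=(128, 128))

# Two hypothetical instances with 17 keypoints each, in input-image pixels.
keypoints = np.random.rand(2, 17, 2) * 512
visible = np.ones((2, 17), dtype=np.float32)

encoded = codec.encode(keypoints, visible)  # pseudo bboxes are derived from keypoints
print(encoded['heatmaps'].shape)            # (18, 128, 128): K keypoint maps + 1 root map
print(encoded['instance_heatmaps'].shape)   # (M*17, 128, 128) with M <= 2 selected instances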
mmpose/codecs/edpose_label.py ADDED
@@ -0,0 +1,153 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from typing import Optional
3
+
4
+ import numpy as np
5
+
6
+ from mmpose.registry import KEYPOINT_CODECS
7
+ from mmpose.structures import bbox_cs2xyxy, bbox_xyxy2cs
8
+ from .base import BaseKeypointCodec
9
+
10
+
11
+ @KEYPOINT_CODECS.register_module()
12
+ class EDPoseLabel(BaseKeypointCodec):
13
+ r"""Generate keypoint and label coordinates for `ED-Pose`_ by
14
+ Yang J. et al (2023).
15
+
16
+ Note:
17
+
18
+ - instance number: N
19
+ - keypoint number: K
20
+ - keypoint dimension: D
21
+ - image size: [w, h]
22
+
23
+ Encoded:
24
+
25
+ - keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
26
+ - keypoints_visible (np.ndarray): Keypoint visibility in shape
27
+ (N, K, D)
28
+ - area (np.ndarray): Area in shape (N)
29
+ - bbox (np.ndarray): Bbox in shape (N, 4)
30
+
31
+ Args:
32
+ num_select (int): The number of candidate instances
33
+ num_keypoints (int): The number of keypoints
34
+ """
35
+
36
+ auxiliary_encode_keys = {'area', 'bboxes', 'img_shape'}
37
+ instance_mapping_table = dict(
38
+ bbox='bboxes',
39
+ keypoints='keypoints',
40
+ keypoints_visible='keypoints_visible',
41
+ area='areas',
42
+ )
43
+
44
+ def __init__(self, num_select: int = 100, num_keypoints: int = 17):
45
+ super().__init__()
46
+
47
+ self.num_select = num_select
48
+ self.num_keypoints = num_keypoints
49
+
50
+ def encode(
51
+ self,
52
+ img_shape,
53
+ keypoints: np.ndarray,
54
+ keypoints_visible: Optional[np.ndarray] = None,
55
+ area: Optional[np.ndarray] = None,
56
+ bboxes: Optional[np.ndarray] = None,
57
+ ) -> dict:
58
+ """Encoding keypoints, area and bbox from input image space to
59
+ normalized space.
60
+
61
+ Args:
62
+ - img_shape (Sequence[int]): The shape of image in the format
63
+ of (width, height).
64
+ - keypoints (np.ndarray): Keypoint coordinates in
65
+ shape (N, K, D).
66
+ - keypoints_visible (np.ndarray): Keypoint visibility in shape
67
+ (N, K)
68
+ - area (np.ndarray): Instance areas in shape (N, )
69
+ - bboxes (np.ndarray): Bounding boxes in shape (N, 4), in xyxy format
70
+
71
+ Returns:
72
+ encoded (dict): Contains the following items:
73
+
74
+ - keypoint_labels (np.ndarray): The processed keypoints in
75
+ shape (N, K, D).
76
+ - keypoints_visible (np.ndarray): Keypoint visibility in shape
77
+ (N, K, D)
78
+ - area_labels (np.ndarray): The processed target
79
+ area in shape (N).
80
+ - bboxes_labels: The processed target bbox in
81
+ shape (N, 4).
82
+ """
83
+ w, h = img_shape
84
+
85
+ if keypoints_visible is None:
86
+ keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
87
+
88
+ if bboxes is not None:
89
+ bboxes = np.concatenate(bbox_xyxy2cs(bboxes), axis=-1)
90
+ bboxes = bboxes / np.array([w, h, w, h], dtype=np.float32)
91
+
92
+ if area is not None:
93
+ area = area / float(w * h)
94
+
95
+ if keypoints is not None:
96
+ keypoints = keypoints / np.array([w, h], dtype=np.float32)
97
+
98
+ encoded = dict(
99
+ keypoints=keypoints,
100
+ area=area,
101
+ bbox=bboxes,
102
+ keypoints_visible=keypoints_visible)
103
+
104
+ return encoded
105
+
106
+ def decode(self, input_shapes: np.ndarray, pred_logits: np.ndarray,
107
+ pred_boxes: np.ndarray, pred_keypoints: np.ndarray):
108
+ """Select the final top-k keypoints, and decode the results from
109
+ normalized size to the original input size.
110
+
111
+ Args:
112
+ input_shapes (np.ndarray): The (height, width) of the resized input image.
113
114
+ pred_logits (np.ndarray): The predicted classification scores.
115
+ pred_boxes (np.ndarray): The predicted bounding boxes in (cx, cy, w, h).
116
+ pred_keypoints (np.ndarray): The predicted keypoint coordinates.
117
+
118
+ Returns:
119
+ tuple: Decoded boxes, keypoints, and keypoint scores.
120
+ """
121
+
122
+ # Initialization
123
+ num_keypoints = self.num_keypoints
124
+ prob = pred_logits.reshape(-1)
125
+
126
+ # Select top-k instances based on prediction scores
127
+ topk_indexes = np.argsort(-prob)[:self.num_select]
128
+ topk_values = np.take_along_axis(prob, topk_indexes, axis=0)
129
+ scores = np.tile(topk_values[:, np.newaxis], [1, num_keypoints])
130
+
131
+ # Decode bounding boxes
132
+ topk_boxes = topk_indexes // pred_logits.shape[1]
133
+ boxes = bbox_cs2xyxy(*np.split(pred_boxes, [2], axis=-1))
134
+ boxes = np.take_along_axis(
135
+ boxes, np.tile(topk_boxes[:, np.newaxis], [1, 4]), axis=0)
136
+
137
+ # Convert from relative to absolute coordinates
138
+ img_h, img_w = np.split(input_shapes, 2, axis=0)
139
+ scale_fct = np.hstack([img_w, img_h, img_w, img_h])
140
+ boxes = boxes * scale_fct[np.newaxis, :]
141
+
142
+ # Decode keypoints
143
+ topk_keypoints = topk_indexes // pred_logits.shape[1]
144
+ keypoints = np.take_along_axis(
145
+ pred_keypoints,
146
+ np.tile(topk_keypoints[:, np.newaxis], [1, num_keypoints * 3]),
147
+ axis=0)
148
+ keypoints = keypoints[:, :(num_keypoints * 2)]
149
+ keypoints = keypoints * np.tile(
150
+ np.hstack([img_w, img_h]), [num_keypoints])[np.newaxis, :]
151
+ keypoints = keypoints.reshape(-1, num_keypoints, 2)
152
+
153
+ return boxes, keypoints, scores
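A decoding sketch for EDPoseLabel, not part of the commit; the prediction arrays are random stand-ins shaped like a single-class query-based head, and the export path is assumed to follow upstream MMPose:

import numpy as np

from mmpose.codecs import EDPoseLabel

codec = EDPoseLabel(num_select=10, num_keypoints=17)

num_queries = 50
pred_logits = np.random.rand(num_queries, 1)          # per-query scores (one class)
pred_boxes = np.random.rand(num_queries, 4)           # (cx, cy, w, h), normalized
pred_keypoints = np.random.rand(num_queries, 17 * 3)  # (x, y, vis) per joint, normalized
input_shapes = np.array([480, 640])                   # (height, width) of the resized input

boxes, keypoints, scores = codec.decode(input_shapes, pred_logits,
                                        pred_boxes, pred_keypoints)
print(boxes.shape, keypoints.shape, scores.shape)     # (10, 4) (10, 17, 2) (10, 17)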
mmpose/codecs/hand_3d_heatmap.py ADDED
@@ -0,0 +1,202 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from typing import Optional, Tuple
3
+
4
+ import numpy as np
5
+
6
+ from mmpose.registry import KEYPOINT_CODECS
7
+ from .base import BaseKeypointCodec
8
+ from .utils.gaussian_heatmap import generate_3d_gaussian_heatmaps
9
+ from .utils.post_processing import get_heatmap_3d_maximum
10
+
11
+
12
+ @KEYPOINT_CODECS.register_module()
13
+ class Hand3DHeatmap(BaseKeypointCodec):
14
+ r"""Generate target 3d heatmap and relative root depth for hand datasets.
15
+
16
+ Note:
17
+
18
+ - instance number: N
19
+ - keypoint number: K
20
+ - keypoint dimension: D
21
+
22
+ Args:
23
+ image_size (tuple): Size of image. Default: ``[256, 256]``.
24
+ root_heatmap_size (int): Size of heatmap of root head.
25
+ Default: 64.
26
+ heatmap_size (tuple): Size of heatmap. Default: ``[64, 64, 64]``.
27
+ heatmap3d_depth_bound (float): Boundary for 3d heatmap depth.
28
+ Default: 400.0.
29
+ heatmap_size_root (int): Size of 3d heatmap root. Default: 64.
30
+ depth_size (int): Number of depth discretization size, used for
31
+ decoding. Defaults to 64.
32
+ root_depth_bound (float): Boundary for 3d heatmap root depth.
33
+ Default: 400.0.
34
+ use_different_joint_weights (bool): Whether to use different joint
35
+ weights. Default: ``False``.
36
+ sigma (int): Sigma of heatmap gaussian. Default: 2.
37
+ joint_indices (list, optional): Indices of joints used for heatmap
38
+ generation. If None (default) is given, all joints will be used.
39
+ Default: ``None``.
40
+ max_bound (float): The maximal value of heatmap. Default: 1.0.
41
+ """
42
+
43
+ auxiliary_encode_keys = {
44
+ 'dataset_keypoint_weights', 'rel_root_depth', 'rel_root_valid',
45
+ 'hand_type', 'hand_type_valid', 'focal', 'principal_pt'
46
+ }
47
+
48
+ instance_mapping_table = {
49
+ 'keypoints': 'keypoints',
50
+ 'keypoints_visible': 'keypoints_visible',
51
+ 'keypoints_cam': 'keypoints_cam',
52
+ }
53
+
54
+ label_mapping_table = {
55
+ 'keypoint_weights': 'keypoint_weights',
56
+ 'root_depth_weight': 'root_depth_weight',
57
+ 'type_weight': 'type_weight',
58
+ 'root_depth': 'root_depth',
59
+ 'type': 'type'
60
+ }
61
+
62
+ def __init__(self,
63
+ image_size: Tuple[int, int] = [256, 256],
64
+ root_heatmap_size: int = 64,
65
+ heatmap_size: Tuple[int, int, int] = [64, 64, 64],
66
+ heatmap3d_depth_bound: float = 400.0,
67
+ heatmap_size_root: int = 64,
68
+ root_depth_bound: float = 400.0,
69
+ depth_size: int = 64,
70
+ use_different_joint_weights: bool = False,
71
+ sigma: int = 2,
72
+ joint_indices: Optional[list] = None,
73
+ max_bound: float = 1.0):
74
+ super().__init__()
75
+
76
+ self.image_size = np.array(image_size)
77
+ self.root_heatmap_size = root_heatmap_size
78
+ self.heatmap_size = np.array(heatmap_size)
79
+ self.heatmap3d_depth_bound = heatmap3d_depth_bound
80
+ self.heatmap_size_root = heatmap_size_root
81
+ self.root_depth_bound = root_depth_bound
82
+ self.depth_size = depth_size
83
+ self.use_different_joint_weights = use_different_joint_weights
84
+
85
+ self.sigma = sigma
86
+ self.joint_indices = joint_indices
87
+ self.max_bound = max_bound
88
+ self.scale_factor = (np.array(image_size) /
89
+ heatmap_size[:-1]).astype(np.float32)
90
+
91
+ def encode(
92
+ self,
93
+ keypoints: np.ndarray,
94
+ keypoints_visible: Optional[np.ndarray],
95
+ dataset_keypoint_weights: Optional[np.ndarray],
96
+ rel_root_depth: np.float32,
97
+ rel_root_valid: np.float32,
98
+ hand_type: np.ndarray,
99
+ hand_type_valid: np.ndarray,
100
+ focal: np.ndarray,
101
+ principal_pt: np.ndarray,
102
+ ) -> dict:
103
+ """Encoding keypoints from input image space to input image space.
104
+
105
+ Args:
106
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D).
107
+ keypoints_visible (np.ndarray, optional): Keypoint visibilities in
108
+ shape (N, K).
109
+ dataset_keypoint_weights (np.ndarray, optional): Keypoints weight
110
+ in shape (K, ).
111
+ rel_root_depth (np.float32): Relative root depth.
112
+ rel_root_valid (float): Validity of relative root depth.
113
+ hand_type (np.ndarray): Type of hand encoded as a array.
114
+ hand_type_valid (np.ndarray): Validity of hand type.
115
+ focal (np.ndarray): Focal length of camera.
116
+ principal_pt (np.ndarray): Principal point of camera.
117
+
118
+ Returns:
119
+ encoded (dict): Contains the following items:
120
+
121
+ - heatmaps (np.ndarray): The generated heatmap in shape
122
+ (K * D, H, W) where [W, H, D] is the `heatmap_size`
123
+ - keypoint_weights (np.ndarray): The target weights in shape
124
+ (N, K)
125
+ - root_depth (np.ndarray): Encoded relative root depth
126
+ - root_depth_weight (np.ndarray): The weights of relative root
127
+ depth
128
+ - type (np.ndarray): Encoded hand type
129
+ - type_weight (np.ndarray): The weights of hand type
130
+ """
131
+ if keypoints_visible is None:
132
+ keypoints_visible = np.ones(keypoints.shape[:-1], dtype=np.float32)
133
+
134
+ if self.use_different_joint_weights:
135
+ assert dataset_keypoint_weights is not None, 'To use different ' \
136
+ 'joint weights,`dataset_keypoint_weights` cannot be None.'
137
+
138
+ heatmaps, keypoint_weights = generate_3d_gaussian_heatmaps(
139
+ heatmap_size=self.heatmap_size,
140
+ keypoints=keypoints,
141
+ keypoints_visible=keypoints_visible,
142
+ sigma=self.sigma,
143
+ image_size=self.image_size,
144
+ heatmap3d_depth_bound=self.heatmap3d_depth_bound,
145
+ joint_indices=self.joint_indices,
146
+ max_bound=self.max_bound,
147
+ use_different_joint_weights=self.use_different_joint_weights,
148
+ dataset_keypoint_weights=dataset_keypoint_weights)
149
+
150
+ rel_root_depth = (rel_root_depth / self.root_depth_bound +
151
+ 0.5) * self.heatmap_size_root
152
+ rel_root_valid = rel_root_valid * (rel_root_depth >= 0) * (
153
+ rel_root_depth <= self.heatmap_size_root)
154
+
155
+ encoded = dict(
156
+ heatmaps=heatmaps,
157
+ keypoint_weights=keypoint_weights,
158
+ root_depth=rel_root_depth * np.ones(1, dtype=np.float32),
159
+ type=hand_type,
160
+ type_weight=hand_type_valid,
161
+ root_depth_weight=rel_root_valid * np.ones(1, dtype=np.float32))
162
+ return encoded
163
+
164
+ def decode(self, heatmaps: np.ndarray, root_depth: np.ndarray,
165
+ hand_type: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
166
+ """Decode keypoint coordinates from heatmaps. The decoded keypoint
167
+ coordinates are in the input image space.
168
+
169
+ Args:
170
+ heatmaps (np.ndarray): Heatmaps in shape (K, D, H, W)
171
+ root_depth (np.ndarray): Root depth prediction.
172
+ hand_type (np.ndarray): Hand type prediction.
173
+
174
+ Returns:
175
+ tuple:
176
+ - keypoints (np.ndarray): Decoded keypoint coordinates in shape
177
+ (N, K, D)
178
+ - scores (np.ndarray): The keypoint scores in shape (N, K). It
179
+ usually represents the confidence of the keypoint prediction
180
+ """
181
+ heatmap3d = heatmaps.copy()
182
+
183
+ keypoints, scores = get_heatmap_3d_maximum(heatmap3d)
184
+
185
+ # transform keypoint depth to camera space
186
+ keypoints[..., 2] = (keypoints[..., 2] / self.depth_size -
187
+ 0.5) * self.heatmap3d_depth_bound
188
+
189
+ # Unsqueeze the instance dimension for single-instance results
190
+ keypoints, scores = keypoints[None], scores[None]
191
+
192
+ # Restore the keypoint scale
193
+ keypoints[..., :2] = keypoints[..., :2] * self.scale_factor
194
+
195
+ # decode relative hand root depth
196
+ # transform relative root depth to camera space
197
+ rel_root_depth = ((root_depth / self.root_heatmap_size - 0.5) *
198
+ self.root_depth_bound)
199
+
200
+ hand_type = (hand_type > 0).reshape(1, -1).astype(int)
201
+
202
+ return keypoints, scores, rel_root_depth, hand_type
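A small numeric round trip, not part of the commit, for the relative root depth normalisation used by Hand3DHeatmap. The encoder scales by heatmap_size_root while the decoder divides by root_heatmap_size; both default to 64, which is what makes the round trip exact with default settings:

root_depth_bound, root_heatmap_size = 400.0, 64              # the class defaults

d = 120.0                                                    # hypothetical root depth in mm
encoded = (d / root_depth_bound + 0.5) * root_heatmap_size   # 51.2, the regression target
decoded = (encoded / root_heatmap_size - 0.5) * root_depth_bound
assert abs(decoded - d) < 1e-6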
mmpose/codecs/image_pose_lifting.py ADDED
@@ -0,0 +1,280 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from typing import List, Optional, Tuple, Union
3
+
4
+ import numpy as np
5
+
6
+ from mmpose.registry import KEYPOINT_CODECS
7
+ from .base import BaseKeypointCodec
8
+
9
+
10
+ @KEYPOINT_CODECS.register_module()
11
+ class ImagePoseLifting(BaseKeypointCodec):
12
+ r"""Generate keypoint coordinates for pose lifter.
13
+
14
+ Note:
15
+
16
+ - instance number: N
17
+ - keypoint number: K
18
+ - keypoint dimension: D
19
+ - pose-lifting target dimension: C
20
+
21
+ Args:
22
+ num_keypoints (int): The number of keypoints in the dataset.
23
+ root_index (Union[int, List]): Root keypoint index in the pose.
24
+ remove_root (bool): If true, remove the root keypoint from the pose.
25
+ Default: ``False``.
26
+ save_index (bool): If true, store the root position separated from the
27
+ original pose. Default: ``False``.
28
+ reshape_keypoints (bool): If true, reshape the keypoints into shape
29
+ (-1, N). Default: ``True``.
30
+ concat_vis (bool): If true, concat the visibility item of keypoints.
31
+ Default: ``False``.
32
+ keypoints_mean (np.ndarray, optional): Mean values of keypoints
33
+ coordinates in shape (K, D).
34
+ keypoints_std (np.ndarray, optional): Std values of keypoints
35
+ coordinates in shape (K, D).
36
+ target_mean (np.ndarray, optional): Mean values of pose-lifting target
37
+ coordinates in shape (K, C).
38
+ target_std (np.ndarray, optional): Std values of pose-lifting target
39
+ coordinates in shape (K, C).
40
+ """
41
+
42
+ auxiliary_encode_keys = {'lifting_target', 'lifting_target_visible'}
43
+
44
+ instance_mapping_table = dict(
45
+ lifting_target='lifting_target',
46
+ lifting_target_visible='lifting_target_visible',
47
+ )
48
+ label_mapping_table = dict(
49
+ trajectory_weights='trajectory_weights',
50
+ lifting_target_label='lifting_target_label',
51
+ lifting_target_weight='lifting_target_weight')
52
+
53
+ def __init__(self,
54
+ num_keypoints: int,
55
+ root_index: Union[int, List] = 0,
56
+ remove_root: bool = False,
57
+ save_index: bool = False,
58
+ reshape_keypoints: bool = True,
59
+ concat_vis: bool = False,
60
+ keypoints_mean: Optional[np.ndarray] = None,
61
+ keypoints_std: Optional[np.ndarray] = None,
62
+ target_mean: Optional[np.ndarray] = None,
63
+ target_std: Optional[np.ndarray] = None,
64
+ additional_encode_keys: Optional[List[str]] = None):
65
+ super().__init__()
66
+
67
+ self.num_keypoints = num_keypoints
68
+ if isinstance(root_index, int):
69
+ root_index = [root_index]
70
+ self.root_index = root_index
71
+ self.remove_root = remove_root
72
+ self.save_index = save_index
73
+ self.reshape_keypoints = reshape_keypoints
74
+ self.concat_vis = concat_vis
75
+ if keypoints_mean is not None:
76
+ assert keypoints_std is not None, 'keypoints_std is None'
77
+ keypoints_mean = np.array(
78
+ keypoints_mean,
79
+ dtype=np.float32).reshape(1, num_keypoints, -1)
80
+ keypoints_std = np.array(
81
+ keypoints_std, dtype=np.float32).reshape(1, num_keypoints, -1)
82
+
83
+ assert keypoints_mean.shape == keypoints_std.shape, (
84
+ f'keypoints_mean.shape {keypoints_mean.shape} != '
85
+ f'keypoints_std.shape {keypoints_std.shape}')
86
+ if target_mean is not None:
87
+ assert target_std is not None, 'target_std is None'
88
+ target_dim = num_keypoints - 1 if remove_root else num_keypoints
89
+ target_mean = np.array(
90
+ target_mean, dtype=np.float32).reshape(1, target_dim, -1)
91
+ target_std = np.array(
92
+ target_std, dtype=np.float32).reshape(1, target_dim, -1)
93
+
94
+ assert target_mean.shape == target_std.shape, (
95
+ f'target_mean.shape {target_mean.shape} != '
96
+ f'target_std.shape {target_std.shape}')
97
+ self.keypoints_mean = keypoints_mean
98
+ self.keypoints_std = keypoints_std
99
+ self.target_mean = target_mean
100
+ self.target_std = target_std
101
+
102
+ if additional_encode_keys is not None:
103
+ self.auxiliary_encode_keys.update(additional_encode_keys)
104
+
105
+ def encode(self,
106
+ keypoints: np.ndarray,
107
+ keypoints_visible: Optional[np.ndarray] = None,
108
+ lifting_target: Optional[np.ndarray] = None,
109
+ lifting_target_visible: Optional[np.ndarray] = None) -> dict:
110
+ """Encoding keypoints from input image space to normalized space.
111
+
112
+ Args:
113
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D).
114
+ keypoints_visible (np.ndarray, optional): Keypoint visibilities in
115
+ shape (N, K).
116
+ lifting_target (np.ndarray, optional): 3d target coordinate in
117
+ shape (T, K, C).
118
+ lifting_target_visible (np.ndarray, optional): Target visibility in
119
+ shape (T, K, ).
120
+
121
+ Returns:
122
+ encoded (dict): Contains the following items:
123
+
124
+ - keypoint_labels (np.ndarray): The processed keypoints in
125
+ shape (N, K, D) or (K * D, N).
126
+ - keypoint_labels_visible (np.ndarray): The processed
127
+ keypoints' weights in shape (N, K, ) or (N-1, K, ).
128
+ - lifting_target_label: The processed target coordinate in
129
+ shape (K, C) or (K-1, C).
130
+ - lifting_target_weight (np.ndarray): The target weights in
131
+ shape (K, ) or (K-1, ).
132
+ - trajectory_weights (np.ndarray): The trajectory weights in
133
+ shape (K, ).
134
+ - target_root (np.ndarray): The root coordinate of target in
135
+ shape (C, ).
136
+
137
+ In addition, there are some optional items it may contain:
138
+
139
+ - target_root (np.ndarray): The root coordinate of target in
140
+ shape (C, ). Exists if ``zero_center`` is ``True``.
141
+ - target_root_removed (bool): Indicate whether the root of
142
+ pose-lifting target is removed. Exists if
143
+ ``remove_root`` is ``True``.
144
+ - target_root_index (int): An integer indicating the index of
145
+ root. Exists if ``remove_root`` and ``save_index``
146
+ are ``True``.
147
+ """
148
+ if keypoints_visible is None:
149
+ keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
150
+
151
+ if lifting_target is None:
152
+ lifting_target = [keypoints[0]]
153
+
154
+ # set initial value for `lifting_target_weight`
155
+ # and `trajectory_weights`
156
+ if lifting_target_visible is None:
157
+ lifting_target_visible = np.ones(
158
+ lifting_target.shape[:-1], dtype=np.float32)
159
+ lifting_target_weight = lifting_target_visible
160
+ trajectory_weights = (1 / lifting_target[:, 2])
161
+ else:
162
+ valid = lifting_target_visible > 0.5
163
+ lifting_target_weight = np.where(valid, 1., 0.).astype(np.float32)
164
+ trajectory_weights = lifting_target_weight
165
+
166
+ encoded = dict()
167
+
168
+ # Zero-center the target pose around a given root keypoint
169
+ assert (lifting_target.ndim >= 2 and
170
+ lifting_target.shape[-2] > max(self.root_index)), \
171
+ f'Got invalid joint shape {lifting_target.shape}'
172
+
173
+ root = np.mean(
174
+ lifting_target[..., self.root_index, :], axis=-2, dtype=np.float32)
175
+ lifting_target_label = lifting_target - root[np.newaxis, ...]
176
+
177
+ if self.remove_root and len(self.root_index) == 1:
178
+ root_index = self.root_index[0]
179
+ lifting_target_label = np.delete(
180
+ lifting_target_label, root_index, axis=-2)
181
+ lifting_target_visible = np.delete(
182
+ lifting_target_visible, root_index, axis=-2)
183
+ assert lifting_target_weight.ndim in {
184
+ 2, 3
185
+ }, (f'lifting_target_weight.ndim {lifting_target_weight.ndim} '
186
+ 'is not in {2, 3}')
187
+
188
+ axis_to_remove = -2 if lifting_target_weight.ndim == 3 else -1
189
+ lifting_target_weight = np.delete(
190
+ lifting_target_weight, root_index, axis=axis_to_remove)
191
+ # Add a flag to avoid latter transforms that rely on the root
192
+ # joint or the original joint index
193
+ encoded['target_root_removed'] = True
194
+
195
+ # Save the root index which is necessary to restore the global pose
196
+ if self.save_index:
197
+ encoded['target_root_index'] = root_index
198
+
199
+ # Normalize the 2D keypoint coordinate with mean and std
200
+ keypoint_labels = keypoints.copy()
201
+
202
+ if self.keypoints_mean is not None:
203
+ assert self.keypoints_mean.shape[1:] == keypoints.shape[1:], (
204
+ f'self.keypoints_mean.shape[1:] {self.keypoints_mean.shape[1:]} ' # noqa
205
+ f'!= keypoints.shape[1:] {keypoints.shape[1:]}')
206
+ encoded['keypoints_mean'] = self.keypoints_mean.copy()
207
+ encoded['keypoints_std'] = self.keypoints_std.copy()
208
+
209
+ keypoint_labels = (keypoint_labels -
210
+ self.keypoints_mean) / self.keypoints_std
211
+ if self.target_mean is not None:
212
+ assert self.target_mean.shape == lifting_target_label.shape, (
213
+ f'self.target_mean.shape {self.target_mean.shape} '
214
+ f'!= lifting_target_label.shape {lifting_target_label.shape}' # noqa
215
+ )
216
+ encoded['target_mean'] = self.target_mean.copy()
217
+ encoded['target_std'] = self.target_std.copy()
218
+
219
+ lifting_target_label = (lifting_target_label -
220
+ self.target_mean) / self.target_std
221
+
222
+ # Generate reshaped keypoint coordinates
223
+ assert keypoint_labels.ndim in {
224
+ 2, 3
225
+ }, (f'keypoint_labels.ndim {keypoint_labels.ndim} is not in {2, 3}')
226
+ if keypoint_labels.ndim == 2:
227
+ keypoint_labels = keypoint_labels[None, ...]
228
+
229
+ if self.concat_vis:
230
+ keypoints_visible_ = keypoints_visible
231
+ if keypoints_visible.ndim == 2:
232
+ keypoints_visible_ = keypoints_visible[..., None]
233
+ keypoint_labels = np.concatenate(
234
+ (keypoint_labels, keypoints_visible_), axis=2)
235
+
236
+ if self.reshape_keypoints:
237
+ N = keypoint_labels.shape[0]
238
+ keypoint_labels = keypoint_labels.transpose(1, 2, 0).reshape(-1, N)
239
+
240
+ encoded['keypoint_labels'] = keypoint_labels
241
+ encoded['keypoint_labels_visible'] = keypoints_visible
242
+ encoded['lifting_target_label'] = lifting_target_label
243
+ encoded['lifting_target_weight'] = lifting_target_weight
244
+ encoded['trajectory_weights'] = trajectory_weights
245
+ encoded['target_root'] = root
246
+
247
+ return encoded
248
+
249
+ def decode(self,
250
+ encoded: np.ndarray,
251
+ target_root: Optional[np.ndarray] = None
252
+ ) -> Tuple[np.ndarray, np.ndarray]:
253
+ """Decode keypoint coordinates from normalized space to input image
254
+ space.
255
+
256
+ Args:
257
+ encoded (np.ndarray): Coordinates in shape (N, K, C).
258
+ target_root (np.ndarray, optional): The target root coordinate.
259
+ Default: ``None``.
260
+
261
+ Returns:
262
+ keypoints (np.ndarray): Decoded coordinates in shape (N, K, C).
263
+ scores (np.ndarray): The keypoint scores in shape (N, K).
264
+ """
265
+ keypoints = encoded.copy()
266
+
267
+ if self.target_mean is not None and self.target_std is not None:
268
+ assert self.target_mean.shape == keypoints.shape, (
269
+ f'self.target_mean.shape {self.target_mean.shape} '
270
+ f'!= keypoints.shape {keypoints.shape}')
271
+ keypoints = keypoints * self.target_std + self.target_mean
272
+
273
+ if target_root is not None and target_root.size > 0:
274
+ keypoints = keypoints + target_root
275
+ if self.remove_root and len(self.root_index) == 1:
276
+ keypoints = np.insert(
277
+ keypoints, self.root_index, target_root, axis=1)
278
+ scores = np.ones(keypoints.shape[:-1], dtype=np.float32)
279
+
280
+ return keypoints, scores
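A single-frame usage sketch for ImagePoseLifting, not part of the commit; the exported name is assumed to follow upstream MMPose and all values are random:

import numpy as np

from mmpose.codecs import ImagePoseLifting

codec = ImagePoseLifting(num_keypoints=17, root_index=0)

keypoints = np.random.rand(1, 17, 2) * 256      # one frame of 2D keypoints
lifting_target = np.random.rand(1, 17, 3)       # hypothetical 3D target pose

encoded = codec.encode(keypoints, lifting_target=lifting_target)
print(encoded['keypoint_labels'].shape)          # (34, 1): reshaped to (K*D, N)
print(encoded['lifting_target_label'].shape)     # (1, 17, 3), centred on the root joint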
mmpose/codecs/integral_regression_label.py ADDED
@@ -0,0 +1,121 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import numpy as np
6
+
7
+ from mmpose.registry import KEYPOINT_CODECS
8
+ from .base import BaseKeypointCodec
9
+ from .msra_heatmap import MSRAHeatmap
10
+ from .regression_label import RegressionLabel
11
+
12
+
13
+ @KEYPOINT_CODECS.register_module()
14
+ class IntegralRegressionLabel(BaseKeypointCodec):
15
+ """Generate keypoint coordinates and normalized heatmaps. See the paper:
16
+ `DSNT`_ by Nibali et al(2018).
17
+
18
+ Note:
19
+
20
+ - instance number: N
21
+ - keypoint number: K
22
+ - keypoint dimension: D
23
+ - image size: [w, h]
24
+
25
+ Encoded:
26
+
27
+ - keypoint_labels (np.ndarray): The normalized regression labels in
28
+ shape (N, K, D) where D is 2 for 2d coordinates
29
+ - heatmaps (np.ndarray): The generated heatmap in shape (K, H, W) where
30
+ [W, H] is the `heatmap_size`
31
+ - keypoint_weights (np.ndarray): The target weights in shape (N, K)
32
+
33
+ Args:
34
+ input_size (tuple): Input image size in [w, h]
35
+ heatmap_size (tuple): Heatmap size in [W, H]
36
+ sigma (float): The sigma value of the Gaussian heatmap
37
+ unbiased (bool): Whether to use the unbiased method (DarkPose) in ``'msra'``
38
+ encoding. See `Dark Pose`_ for details. Defaults to ``False``
39
+ blur_kernel_size (int): The Gaussian blur kernel size of the heatmap
40
+ modulation in DarkPose. The kernel size and sigma should follow
41
+ the expirical formula :math:`sigma = 0.3*((ks-1)*0.5-1)+0.8`.
42
+ Defaults to 11
43
+ normalize (bool): Whether to normalize the heatmaps. Defaults to True.
44
+
45
+ .. _`DSNT`: https://arxiv.org/abs/1801.07372
46
+ """
47
+
48
+ label_mapping_table = dict(
49
+ keypoint_labels='keypoint_labels',
50
+ keypoint_weights='keypoint_weights',
51
+ )
52
+ field_mapping_table = dict(heatmaps='heatmaps', )
53
+
54
+ def __init__(self,
55
+ input_size: Tuple[int, int],
56
+ heatmap_size: Tuple[int, int],
57
+ sigma: float,
58
+ unbiased: bool = False,
59
+ blur_kernel_size: int = 11,
60
+ normalize: bool = True) -> None:
61
+ super().__init__()
62
+
63
+ self.heatmap_codec = MSRAHeatmap(input_size, heatmap_size, sigma,
64
+ unbiased, blur_kernel_size)
65
+ self.keypoint_codec = RegressionLabel(input_size)
66
+ self.normalize = normalize
67
+
68
+ def encode(self,
69
+ keypoints: np.ndarray,
70
+ keypoints_visible: Optional[np.ndarray] = None) -> dict:
71
+ """Encoding keypoints to regression labels and heatmaps.
72
+
73
+ Args:
74
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
75
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
76
+ (N, K)
77
+
78
+ Returns:
79
+ dict:
80
+ - keypoint_labels (np.ndarray): The normalized regression labels in
81
+ shape (N, K, D) where D is 2 for 2d coordinates
82
+ - heatmaps (np.ndarray): The generated heatmap in shape
83
+ (K, H, W) where [W, H] is the `heatmap_size`
84
+ - keypoint_weights (np.ndarray): The target weights in shape
85
+ (N, K)
86
+ """
87
+ encoded_hm = self.heatmap_codec.encode(keypoints, keypoints_visible)
88
+ encoded_kp = self.keypoint_codec.encode(keypoints, keypoints_visible)
89
+
90
+ heatmaps = encoded_hm['heatmaps']
91
+ keypoint_labels = encoded_kp['keypoint_labels']
92
+ keypoint_weights = encoded_kp['keypoint_weights']
93
+
94
+ if self.normalize:
95
+ val_sum = heatmaps.sum(axis=(-1, -2)).reshape(-1, 1, 1) + 1e-24
96
+ heatmaps = heatmaps / val_sum
97
+
98
+ encoded = dict(
99
+ keypoint_labels=keypoint_labels,
100
+ heatmaps=heatmaps,
101
+ keypoint_weights=keypoint_weights)
102
+
103
+ return encoded
104
+
105
+ def decode(self, encoded: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
106
+ """Decode keypoint coordinates from normalized space to input image
107
+ space.
108
+
109
+ Args:
110
+ encoded (np.ndarray): Coordinates in shape (N, K, D)
111
+
112
+ Returns:
113
+ tuple:
114
+ - keypoints (np.ndarray): Decoded coordinates in shape (N, K, D)
115
+ - scores (np.ndarray): The keypoint scores in shape (N, K).
116
+ It usually represents the confidence of the keypoint prediction
117
+ """
118
+
119
+ keypoints, scores = self.keypoint_codec.decode(encoded)
120
+
121
+ return keypoints, scores
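A short sketch, not part of the commit, showing the DSNT-style heatmap normalisation performed by IntegralRegressionLabel; the exported name is assumed to follow upstream MMPose:

import numpy as np

from mmpose.codecs import IntegralRegressionLabel

codec = IntegralRegressionLabel(
    input_size=(192, 256), heatmap_size=(48, 64), sigma=2.0, normalize=True)

keypoints = np.random.rand(1, 17, 2) * [192, 256]    # single instance, input-image pixels
encoded = codec.encode(keypoints)

# With normalize=True every heatmap channel sums to ~1, as DSNT expects.
print(encoded['heatmaps'].sum(axis=(-1, -2)).round(3))
print(encoded['keypoint_labels'].shape)              # (1, 17, 2), normalized to [0, 1]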
mmpose/codecs/megvii_heatmap.py ADDED
@@ -0,0 +1,147 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from itertools import product
3
+ from typing import Optional, Tuple
4
+
5
+ import cv2
6
+ import numpy as np
7
+
8
+ from mmpose.registry import KEYPOINT_CODECS
9
+ from .base import BaseKeypointCodec
10
+ from .utils import gaussian_blur, get_heatmap_maximum
11
+
12
+
13
+ @KEYPOINT_CODECS.register_module()
14
+ class MegviiHeatmap(BaseKeypointCodec):
15
+ """Represent keypoints as heatmaps via "Megvii" approach. See `MSPN`_
16
+ (2019) and `CPN`_ (2018) for details.
17
+
18
+ Note:
19
+
20
+ - instance number: N
21
+ - keypoint number: K
22
+ - keypoint dimension: D
23
+ - image size: [w, h]
24
+ - heatmap size: [W, H]
25
+
26
+ Encoded:
27
+
28
+ - heatmaps (np.ndarray): The generated heatmap in shape (K, H, W)
29
+ where [W, H] is the `heatmap_size`
30
+ - keypoint_weights (np.ndarray): The target weights in shape (N, K)
31
+
32
+ Args:
33
+ input_size (tuple): Image size in [w, h]
34
+ heatmap_size (tuple): Heatmap size in [W, H]
35
+ kernel_size (int): The kernel size of the Gaussian blur applied to
36
+ the heatmap
37
+
38
+ .. _`MSPN`: https://arxiv.org/abs/1901.00148
39
+ .. _`CPN`: https://arxiv.org/abs/1711.07319
40
+ """
41
+
42
+ label_mapping_table = dict(keypoint_weights='keypoint_weights', )
43
+ field_mapping_table = dict(heatmaps='heatmaps', )
44
+
45
+ def __init__(
46
+ self,
47
+ input_size: Tuple[int, int],
48
+ heatmap_size: Tuple[int, int],
49
+ kernel_size: int,
50
+ ) -> None:
51
+
52
+ super().__init__()
53
+ self.input_size = input_size
54
+ self.heatmap_size = heatmap_size
55
+ self.kernel_size = kernel_size
56
+ self.scale_factor = (np.array(input_size) /
57
+ heatmap_size).astype(np.float32)
58
+
59
+ def encode(self,
60
+ keypoints: np.ndarray,
61
+ keypoints_visible: Optional[np.ndarray] = None) -> dict:
62
+ """Encode keypoints into heatmaps. Note that the original keypoint
63
+ coordinates should be in the input image space.
64
+
65
+ Args:
66
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
67
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
68
+ (N, K)
69
+
70
+ Returns:
71
+ dict:
72
+ - heatmaps (np.ndarray): The generated heatmap in shape
73
+ (K, H, W) where [W, H] is the `heatmap_size`
74
+ - keypoint_weights (np.ndarray): The target weights in shape
75
+ (N, K)
76
+ """
77
+
78
+ N, K, _ = keypoints.shape
79
+ W, H = self.heatmap_size
80
+
81
+ assert N == 1, (
82
+ f'{self.__class__.__name__} only support single-instance '
83
+ 'keypoint encoding')
84
+
85
+ heatmaps = np.zeros((K, H, W), dtype=np.float32)
86
+ keypoint_weights = keypoints_visible.copy()
87
+
88
+ for n, k in product(range(N), range(K)):
89
+ # skip unlabeled keypoints
90
+ if keypoints_visible[n, k] < 0.5:
91
+ continue
92
+
93
+ # get center coordinates
94
+ kx, ky = (keypoints[n, k] / self.scale_factor).astype(np.int64)
95
+ if kx < 0 or kx >= W or ky < 0 or ky >= H:
96
+ keypoint_weights[n, k] = 0
97
+ continue
98
+
99
+ heatmaps[k, ky, kx] = 1.
100
+ kernel_size = (self.kernel_size, self.kernel_size)
101
+ heatmaps[k] = cv2.GaussianBlur(heatmaps[k], kernel_size, 0)
102
+
103
+ # normalize the heatmap
104
+ heatmaps[k] = heatmaps[k] / heatmaps[k, ky, kx] * 255.
105
+
106
+ encoded = dict(heatmaps=heatmaps, keypoint_weights=keypoint_weights)
107
+
108
+ return encoded
109
+
110
+ def decode(self, encoded: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
111
+ """Decode keypoint coordinates from heatmaps. The decoded keypoint
112
+ coordinates are in the input image space.
113
+
114
+ Args:
115
+ encoded (np.ndarray): Heatmaps in shape (K, H, W)
116
+
117
+ Returns:
118
+ tuple:
119
+ - keypoints (np.ndarray): Decoded keypoint coordinates in shape
120
+ (K, D)
121
+ - scores (np.ndarray): The keypoint scores in shape (K,). It
122
+ usually represents the confidence of the keypoint prediction
123
+ """
124
+ heatmaps = gaussian_blur(encoded.copy(), self.kernel_size)
125
+ K, H, W = heatmaps.shape
126
+
127
+ keypoints, scores = get_heatmap_maximum(heatmaps)
128
+
129
+ for k in range(K):
130
+ heatmap = heatmaps[k]
131
+ px = int(keypoints[k, 0])
132
+ py = int(keypoints[k, 1])
133
+ if 1 < px < W - 1 and 1 < py < H - 1:
134
+ diff = np.array([
135
+ heatmap[py][px + 1] - heatmap[py][px - 1],
136
+ heatmap[py + 1][px] - heatmap[py - 1][px]
137
+ ])
138
+ keypoints[k] += (np.sign(diff) * 0.25 + 0.5)
139
+
140
+ scores = scores / 255.0 + 0.5
141
+
142
+ # Unsqueeze the instance dimension for single-instance results
143
+ # and restore the keypoint scales
144
+ keypoints = keypoints[None] * self.scale_factor
145
+ scores = scores[None]
146
+
147
+ return keypoints, scores
mmpose/codecs/motionbert_label.py ADDED
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+
3
+ from copy import deepcopy
4
+ from typing import Optional, Tuple
5
+
6
+ import numpy as np
7
+
8
+ from mmpose.registry import KEYPOINT_CODECS
9
+ from .base import BaseKeypointCodec
10
+ from .utils import camera_to_image_coord
11
+
12
+
13
+ @KEYPOINT_CODECS.register_module()
14
+ class MotionBERTLabel(BaseKeypointCodec):
15
+ r"""Generate keypoint and label coordinates for `MotionBERT`_ by Zhu et al
16
+ (2022).
17
+
18
+ Note:
19
+
20
+ - instance number: N
21
+ - keypoint number: K
22
+ - keypoint dimension: D
23
+ - pose-lifitng target dimension: C
24
+
25
+ Args:
26
+ num_keypoints (int): The number of keypoints in the dataset.
27
+ root_index (int): Root keypoint index in the pose. Default: 0.
28
+ remove_root (bool): If true, remove the root keypoint from the pose.
29
+ Default: ``False``.
30
+ save_index (bool): If true, store the root position separated from the
31
+ original pose, only takes effect if ``remove_root`` is ``True``.
32
+ Default: ``False``.
33
+ concat_vis (bool): If true, concat the visibility item of keypoints.
34
+ Default: ``False``.
35
+ rootrel (bool): If true, the root keypoint will be set to the
36
+ coordinate origin. Default: ``False``.
37
+ mode (str): Indicating whether the current mode is 'train' or 'test'.
38
+ Default: ``'test'``.
39
+ """
40
+
41
+ auxiliary_encode_keys = {
42
+ 'lifting_target', 'lifting_target_visible', 'camera_param', 'factor'
43
+ }
44
+
45
+ instance_mapping_table = dict(
46
+ lifting_target='lifting_target',
47
+ lifting_target_visible='lifting_target_visible',
48
+ )
49
+ label_mapping_table = dict(
50
+ trajectory_weights='trajectory_weights',
51
+ lifting_target_label='lifting_target_label',
52
+ lifting_target_weight='lifting_target_weight')
53
+
54
+ def __init__(self,
55
+ num_keypoints: int,
56
+ root_index: int = 0,
57
+ remove_root: bool = False,
58
+ save_index: bool = False,
59
+ concat_vis: bool = False,
60
+ rootrel: bool = False,
61
+ mode: str = 'test'):
62
+ super().__init__()
63
+
64
+ self.num_keypoints = num_keypoints
65
+ self.root_index = root_index
66
+ self.remove_root = remove_root
67
+ self.save_index = save_index
68
+ self.concat_vis = concat_vis
69
+ self.rootrel = rootrel
70
+ assert mode.lower() in {'train', 'test'
71
+ }, (f'Unsupported mode {mode}, '
72
+ 'mode should be one of ("train", "test").')
73
+ self.mode = mode.lower()
74
+
75
+ def encode(self,
76
+ keypoints: np.ndarray,
77
+ keypoints_visible: Optional[np.ndarray] = None,
78
+ lifting_target: Optional[np.ndarray] = None,
79
+ lifting_target_visible: Optional[np.ndarray] = None,
80
+ camera_param: Optional[dict] = None,
81
+ factor: Optional[np.ndarray] = None) -> dict:
82
+ """Encoding keypoints from input image space to normalized space.
83
+
84
+ Args:
85
+ keypoints (np.ndarray): Keypoint coordinates in shape (B, T, K, D).
86
+ keypoints_visible (np.ndarray, optional): Keypoint visibilities in
87
+ shape (B, T, K).
88
+ lifting_target (np.ndarray, optional): 3d target coordinate in
89
+ shape (T, K, C).
90
+ lifting_target_visible (np.ndarray, optional): Target coordinate in
91
+ shape (T, K, ).
92
+ camera_param (dict, optional): The camera parameter dictionary.
93
+ factor (np.ndarray, optional): The factor mapping camera and image
94
+ coordinate in shape (T, ).
95
+
96
+ Returns:
97
+ encoded (dict): Contains the following items:
98
+
99
+ - keypoint_labels (np.ndarray): The processed keypoints in
100
+ shape like (N, K, D).
101
+ - keypoint_labels_visible (np.ndarray): The processed
102
+ keypoints' weights in shape (N, K, ) or (N, K-1, ).
103
+ - lifting_target_label: The processed target coordinate in
104
+ shape (K, C) or (K-1, C).
105
+ - lifting_target_weight (np.ndarray): The target weights in
106
+ shape (K, ) or (K-1, ).
107
+ - factor (np.ndarray): The factor mapping camera and image
108
+ coordinate in shape (T, 1).
109
+ """
110
+ if keypoints_visible is None:
111
+ keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
112
+
113
+ # set initial value for `lifting_target_weight`
114
+ if lifting_target_visible is None:
115
+ lifting_target_visible = np.ones(
116
+ lifting_target.shape[:-1], dtype=np.float32)
117
+ lifting_target_weight = lifting_target_visible
118
+ else:
119
+ valid = lifting_target_visible > 0.5
120
+ lifting_target_weight = np.where(valid, 1., 0.).astype(np.float32)
121
+
122
+ if camera_param is None:
123
+ camera_param = dict()
124
+
125
+ encoded = dict()
126
+
127
+ assert lifting_target is not None
128
+ lifting_target_label = lifting_target.copy()
129
+ keypoint_labels = keypoints.copy()
130
+
131
+ assert keypoint_labels.ndim in {
132
+ 2, 3
133
+ }, (f'Keypoint labels should have 2 or 3 dimensions, '
134
+ f'but got {keypoint_labels.ndim}.')
135
+ if keypoint_labels.ndim == 2:
136
+ keypoint_labels = keypoint_labels[None, ...]
137
+
138
+ # Normalize the 2D keypoint coordinate with image width and height
139
+ _camera_param = deepcopy(camera_param)
140
+ assert 'w' in _camera_param and 'h' in _camera_param, (
141
+ 'Camera parameters should contain "w" and "h".')
142
+ w, h = _camera_param['w'], _camera_param['h']
143
+ keypoint_labels[
144
+ ..., :2] = keypoint_labels[..., :2] / w * 2 - [1, h / w]
145
+
146
+ # convert target to image coordinate
147
+ T = keypoint_labels.shape[0]
148
+ factor_ = np.array([4] * T, dtype=np.float32).reshape(T, )
149
+ if 'f' in _camera_param and 'c' in _camera_param:
150
+ lifting_target_label, factor_ = camera_to_image_coord(
151
+ self.root_index, lifting_target_label, _camera_param)
152
+ if self.mode == 'train':
153
+ w, h = w / 1000, h / 1000
154
+ lifting_target_label[
155
+ ..., :2] = lifting_target_label[..., :2] / w * 2 - [1, h / w]
156
+ lifting_target_label[..., 2] = lifting_target_label[..., 2] / w * 2
157
+ lifting_target_label[..., :, :] = lifting_target_label[
158
+ ..., :, :] - lifting_target_label[...,
159
+ self.root_index:self.root_index +
160
+ 1, :]
161
+ if factor is None or factor[0] == 0:
162
+ factor = factor_
163
+ if factor.ndim == 1:
164
+ factor = factor[:, None]
165
+ if self.mode == 'test':
166
+ lifting_target_label *= factor[..., None]
167
+
168
+ if self.concat_vis:
169
+ keypoints_visible_ = keypoints_visible
170
+ if keypoints_visible.ndim == 2:
171
+ keypoints_visible_ = keypoints_visible[..., None]
172
+ keypoint_labels = np.concatenate(
173
+ (keypoint_labels, keypoints_visible_), axis=2)
174
+
175
+ encoded['keypoint_labels'] = keypoint_labels
176
+ encoded['keypoint_labels_visible'] = keypoints_visible
177
+ encoded['lifting_target_label'] = lifting_target_label
178
+ encoded['lifting_target_weight'] = lifting_target_weight
179
+ encoded['lifting_target'] = lifting_target_label
180
+ encoded['lifting_target_visible'] = lifting_target_visible
181
+ encoded['factor'] = factor
182
+
183
+ return encoded
184
+
185
+ def decode(
186
+ self,
187
+ encoded: np.ndarray,
188
+ w: Optional[np.ndarray] = None,
189
+ h: Optional[np.ndarray] = None,
190
+ factor: Optional[np.ndarray] = None,
191
+ ) -> Tuple[np.ndarray, np.ndarray]:
192
+ """Decode keypoint coordinates from normalized space to input image
193
+ space.
194
+
195
+ Args:
196
+ encoded (np.ndarray): Coordinates in shape (N, K, C).
197
+ w (np.ndarray, optional): The image widths in shape (N, ).
198
+ Default: ``None``.
199
+ h (np.ndarray, optional): The image heights in shape (N, ).
200
+ Default: ``None``.
201
+ factor (np.ndarray, optional): The factor for projection in shape
202
+ (N, ). Default: ``None``.
203
+
204
+ Returns:
205
+ keypoints (np.ndarray): Decoded coordinates in shape (N, K, C).
206
+ scores (np.ndarray): The keypoint scores in shape (N, K).
207
+ """
208
+ keypoints = encoded.copy()
209
+ scores = np.ones(keypoints.shape[:-1], dtype=np.float32)
210
+
211
+ if self.rootrel:
212
+ keypoints[..., 0, :] = 0
213
+
214
+ if w is not None and w.size > 0:
215
+ assert w.shape == h.shape, (f'w and h should have the same shape, '
216
+ f'but got {w.shape} and {h.shape}.')
217
+ assert w.shape[0] == keypoints.shape[0], (
218
+ f'w and h should have the same batch size, '
219
+ f'but got {w.shape[0]} and {keypoints.shape[0]}.')
220
+ assert w.ndim in {1,
221
+ 2}, (f'w and h should have 1 or 2 dimensions, '
222
+ f'but got {w.ndim}.')
223
+ if w.ndim == 1:
224
+ w = w[:, None]
225
+ h = h[:, None]
226
+ trans = np.append(
227
+ np.ones((w.shape[0], 1)), h / w, axis=1)[:, None, :]
228
+ keypoints[..., :2] = (keypoints[..., :2] + trans) * w[:, None] / 2
229
+ keypoints[..., 2:] = keypoints[..., 2:] * w[:, None] / 2
230
+
231
+ if factor is not None and factor.size > 0:
232
+ assert factor.shape[0] == keypoints.shape[0], (
233
+ f'factor should have the same batch size, '
234
+ f'but got {factor.shape[0]} and {keypoints.shape[0]}.')
235
+ keypoints *= factor[..., None]
236
+
237
+ keypoints[..., :, :] = keypoints[..., :, :] - keypoints[
238
+ ..., self.root_index:self.root_index + 1, :]
239
+ keypoints /= 1000.
240
+ return keypoints, scores
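A decoding sketch for MotionBERTLabel, not part of the commit; the width, height and factor arrays are hypothetical and the exported name is assumed to follow upstream MMPose:

import numpy as np

from mmpose.codecs import MotionBERTLabel

codec = MotionBERTLabel(num_keypoints=17)

pred = np.random.rand(1, 17, 3)       # normalized network output in shape (N, K, C)
w = np.array([1920.])                 # image width per sample
h = np.array([1080.])                 # image height per sample
factor = np.array([3.5])              # hypothetical camera-to-image scale factor

keypoints, scores = codec.decode(pred, w=w, h=h, factor=factor)
print(keypoints.shape, scores.shape)  # (1, 17, 3) (1, 17); root-relative, divided by 1000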
mmpose/codecs/msra_heatmap.py ADDED
@@ -0,0 +1,153 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from typing import Optional, Tuple
3
+
4
+ import numpy as np
5
+
6
+ from mmpose.registry import KEYPOINT_CODECS
7
+ from .base import BaseKeypointCodec
8
+ from .utils.gaussian_heatmap import (generate_gaussian_heatmaps,
9
+ generate_unbiased_gaussian_heatmaps)
10
+ from .utils.post_processing import get_heatmap_maximum
11
+ from .utils.refinement import refine_keypoints, refine_keypoints_dark
12
+
13
+
14
+ @KEYPOINT_CODECS.register_module()
15
+ class MSRAHeatmap(BaseKeypointCodec):
16
+ """Represent keypoints as heatmaps via "MSRA" approach. See the paper:
17
+ `Simple Baselines for Human Pose Estimation and Tracking`_ by Xiao et al
18
+ (2018) for details.
19
+
20
+ Note:
21
+
22
+ - instance number: N
23
+ - keypoint number: K
24
+ - keypoint dimension: D
25
+ - image size: [w, h]
26
+ - heatmap size: [W, H]
27
+
28
+ Encoded:
29
+
30
+ - heatmaps (np.ndarray): The generated heatmap in shape (K, H, W)
31
+ where [W, H] is the `heatmap_size`
32
+ - keypoint_weights (np.ndarray): The target weights in shape (N, K)
33
+
34
+ Args:
35
+ input_size (tuple): Image size in [w, h]
36
+ heatmap_size (tuple): Heatmap size in [W, H]
37
+ sigma (float): The sigma value of the Gaussian heatmap
38
+ unbiased (bool): Whether to use the unbiased method (DarkPose) in ``'msra'``
39
+ encoding. See `Dark Pose`_ for details. Defaults to ``False``
40
+ blur_kernel_size (int): The Gaussian blur kernel size of the heatmap
41
+ modulation in DarkPose. The kernel size and sigma should follow
42
+ the empirical formula :math:`sigma = 0.3*((ks-1)*0.5-1)+0.8`.
43
+ Defaults to 11
44
+
45
+ .. _`Simple Baselines for Human Pose Estimation and Tracking`:
46
+ https://arxiv.org/abs/1804.06208
47
+ .. _`Dark Pose`: https://arxiv.org/abs/1910.06278
48
+ """
49
+
50
+ label_mapping_table = dict(keypoint_weights='keypoint_weights', )
51
+ field_mapping_table = dict(heatmaps='heatmaps', )
52
+
53
+ def __init__(self,
54
+ input_size: Tuple[int, int],
55
+ heatmap_size: Tuple[int, int],
56
+ sigma: float,
57
+ unbiased: bool = False,
58
+ blur_kernel_size: int = 11) -> None:
59
+ super().__init__()
60
+ self.input_size = input_size
61
+ self.heatmap_size = heatmap_size
62
+ self.sigma = sigma
63
+ self.unbiased = unbiased
64
+
65
+ # The Gaussian blur kernel size of the heatmap modulation
66
+ # in DarkPose and the sigma value follows the empirical
67
+ # formula :math:`sigma = 0.3*((ks-1)*0.5-1)+0.8`
68
+ # which gives:
69
+ # sigma~=3 if ks=17
70
+ # sigma=2 if ks=11;
71
+ # sigma~=1.5 if ks=7;
72
+ # sigma~=1 if ks=3;
73
+ self.blur_kernel_size = blur_kernel_size
74
+ self.scale_factor = (np.array(input_size) /
75
+ heatmap_size).astype(np.float32)
76
+
77
+ def encode(self,
78
+ keypoints: np.ndarray,
79
+ keypoints_visible: Optional[np.ndarray] = None) -> dict:
80
+ """Encode keypoints into heatmaps. Note that the original keypoint
81
+ coordinates should be in the input image space.
82
+
83
+ Args:
84
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
85
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
86
+ (N, K)
87
+
88
+ Returns:
89
+ dict:
90
+ - heatmaps (np.ndarray): The generated heatmap in shape
91
+ (K, H, W) where [W, H] is the `heatmap_size`
92
+ - keypoint_weights (np.ndarray): The target weights in shape
93
+ (N, K)
94
+ """
95
+
96
+ assert keypoints.shape[0] == 1, (
97
+ f'{self.__class__.__name__} only support single-instance '
98
+ 'keypoint encoding')
99
+
100
+ if keypoints_visible is None:
101
+ keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
102
+
103
+ if self.unbiased:
104
+ heatmaps, keypoint_weights = generate_unbiased_gaussian_heatmaps(
105
+ heatmap_size=self.heatmap_size,
106
+ keypoints=keypoints / self.scale_factor,
107
+ keypoints_visible=keypoints_visible,
108
+ sigma=self.sigma)
109
+ else:
110
+ heatmaps, keypoint_weights = generate_gaussian_heatmaps(
111
+ heatmap_size=self.heatmap_size,
112
+ keypoints=keypoints / self.scale_factor,
113
+ keypoints_visible=keypoints_visible,
114
+ sigma=self.sigma)
115
+
116
+ encoded = dict(heatmaps=heatmaps, keypoint_weights=keypoint_weights)
117
+
118
+ return encoded
119
+
120
+ def decode(self, encoded: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
121
+ """Decode keypoint coordinates from heatmaps. The decoded keypoint
122
+ coordinates are in the input image space.
123
+
124
+ Args:
125
+ encoded (np.ndarray): Heatmaps in shape (K, H, W)
126
+
127
+ Returns:
128
+ tuple:
129
+ - keypoints (np.ndarray): Decoded keypoint coordinates in shape
130
+ (N, K, D)
131
+ - scores (np.ndarray): The keypoint scores in shape (N, K). It
132
+ usually represents the confidence of the keypoint prediction
133
+ """
134
+ heatmaps = encoded.copy()
135
+ K, H, W = heatmaps.shape
136
+
137
+ keypoints, scores = get_heatmap_maximum(heatmaps)
138
+
139
+ # Unsqueeze the instance dimension for single-instance results
140
+ keypoints, scores = keypoints[None], scores[None]
141
+
142
+ if self.unbiased:
143
+ # Alleviate biased coordinate
144
+ keypoints = refine_keypoints_dark(
145
+ keypoints, heatmaps, blur_kernel_size=self.blur_kernel_size)
146
+
147
+ else:
148
+ keypoints = refine_keypoints(keypoints, heatmaps)
149
+
150
+ # Restore the keypoint scale
151
+ keypoints = keypoints * self.scale_factor
152
+
153
+ return keypoints, scores
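A minimal encode/decode round trip for MSRAHeatmap as defined above; the sizes and the COCO-style K=17 are hypothetical, and the import uses the module path added in this commit:

import numpy as np
from mmpose.codecs.msra_heatmap import MSRAHeatmap

codec = MSRAHeatmap(input_size=(192, 256), heatmap_size=(48, 64), sigma=2.0)
keypoints = np.random.rand(1, 17, 2).astype(np.float32) * (192, 256)  # single instance in input space
visible = np.ones((1, 17), dtype=np.float32)

encoded = codec.encode(keypoints, visible)                  # encoded['heatmaps'] has shape (17, 64, 48)
decoded, scores = codec.decode(encoded['heatmaps'])         # keypoints back in the 192x256 input space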
mmpose/codecs/onehot_heatmap.py ADDED
@@ -0,0 +1,263 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from typing import Optional, Tuple
3
+
4
+ import cv2
5
+ import numpy as np
6
+
7
+ from mmpose.registry import KEYPOINT_CODECS
8
+ from .base import BaseKeypointCodec
9
+ from .utils import (generate_offset_heatmap, generate_onehot_heatmaps,
10
+ get_heatmap_maximum, refine_keypoints_dark_udp)
11
+
12
+
13
+ @KEYPOINT_CODECS.register_module()
14
+ class OneHotHeatmap(BaseKeypointCodec):
15
+ r"""Generate one-hot keypoint heatmaps with Unbiased Data Processing (UDP).
16
+ See the paper: `The Devil is in the Details: Delving into Unbiased Data
17
+ Processing for Human Pose Estimation`_ by Huang et al (2020) for details.
18
+
19
+ Note:
20
+
21
+ - instance number: N
22
+ - keypoint number: K
23
+ - keypoint dimension: D
24
+ - image size: [w, h]
25
+ - heatmap size: [W, H]
26
+
27
+ Encoded:
28
+
29
+ - heatmap (np.ndarray): The generated heatmap in shape (C_out, H, W)
30
+ where [W, H] is the `heatmap_size`, and the C_out is the output
31
+ channel number which depends on the `heatmap_type`. If
32
+ `heatmap_type=='gaussian'`, C_out equals to keypoint number K;
33
+ if `heatmap_type=='combined'`, C_out equals to K*3
34
+ (x_offset, y_offset and class label)
35
+ - keypoint_weights (np.ndarray): The target weights in shape (K,)
36
+
37
+ Args:
38
+ input_size (tuple): Image size in [w, h]
39
+ heatmap_size (tuple): Heatmap size in [W, H]
40
+ heatmap_type (str): The heatmap type to encode the keypoints. Options
41
+ are:
42
+
43
+ - ``'gaussian'``: Gaussian heatmap
44
+ - ``'combined'``: Combination of a binary label map and offset
45
+ maps for X and Y axes.
46
+
47
+ sigma (float): The sigma value of the Gaussian heatmap when
48
+ ``heatmap_type=='gaussian'``. Defaults to 2.0
49
+ radius_factor (float): The radius factor of the binary label
50
+ map when ``heatmap_type=='combined'``. The positive region is
51
+ defined as the neighbor of the keypoint with the radius
52
+ :math:`r=radius_factor*max(W, H)`. Defaults to 0.0546875
53
+ blur_kernel_size (int): The Gaussian blur kernel size of the heatmap
54
+ modulation in DarkPose. Defaults to 11
55
+
56
+ .. _`The Devil is in the Details: Delving into Unbiased Data Processing for
57
+ Human Pose Estimation`: https://arxiv.org/abs/1911.07524
58
+ """
59
+
60
+ label_mapping_table = dict(keypoint_weights='keypoint_weights', )
61
+ field_mapping_table = dict(heatmaps='heatmaps', )
62
+
63
+ def __init__(self,
64
+ input_size: Tuple[int, int],
65
+ heatmap_size: Tuple[int, int],
66
+ heatmap_type: str = 'gaussian',
67
+ sigma: float = 2.,
68
+ radius_factor: float = 0.0546875,
69
+ blur_kernel_size: int = 11,
70
+ increase_sigma_with_padding=False,
71
+ amap_scale: float = 1.0,
72
+ normalize=None,
73
+ ) -> None:
74
+ super().__init__()
75
+ self.input_size = np.array(input_size)
76
+ self.heatmap_size = np.array(heatmap_size)
77
+ self.sigma = sigma
78
+ self.radius_factor = radius_factor
79
+ self.heatmap_type = heatmap_type
80
+ self.blur_kernel_size = blur_kernel_size
81
+ self.increase_sigma_with_padding = increase_sigma_with_padding
82
+ self.normalize = normalize
83
+
84
+ self.amap_size = self.input_size * amap_scale
85
+ self.scale_factor = ((self.amap_size - 1) /
86
+ (self.heatmap_size - 1)).astype(np.float32)
87
+ self.input_center = self.input_size / 2
88
+ self.top_left = self.input_center - self.amap_size / 2
89
+
90
+ if self.heatmap_type not in {'gaussian', 'combined'}:
91
+ raise ValueError(
92
+ f'{self.__class__.__name__} got invalid `heatmap_type` value'
93
+ f'{self.heatmap_type}. Should be one of '
94
+ '{"gaussian", "combined"}')
95
+
96
+ def _kpts_to_activation_pts(self, keypoints: np.ndarray) -> np.ndarray:
97
+ """
98
+ Transform the keypoint coordinates to the activation space.
99
+ In the original UDPHeatmap, the activation map covers the same space as the input
100
+ image, only at a lower resolution. Here the activation map may instead have a
101
+ different (padded) size than the input image space.
102
+ Centers of the activation map and the input image space are aligned.
103
+ """
104
+ transformed_keypoints = keypoints - self.top_left
105
+ transformed_keypoints = transformed_keypoints / self.scale_factor
106
+ return transformed_keypoints
107
+
108
+ def _activation_pts_to_kpts(self, keypoints: np.ndarray) -> np.ndarray:
109
+ """
110
+ Transform the points in activation map to the keypoint coordinates.
111
+ In the original UDPHeatmap, the activation map covers the same space as the input
112
+ image, only at a lower resolution. Here the activation map may instead have a
113
+ different (padded) size than the input image space.
114
+ Centers of the activation map and the input image space are aligned.
115
+ """
116
+ W, H = self.heatmap_size
117
+ transformed_keypoints = keypoints / [W - 1, H - 1] * self.amap_size
118
+ transformed_keypoints += self.top_left
119
+ return transformed_keypoints
120
+
121
+ def encode(self,
122
+ keypoints: np.ndarray,
123
+ keypoints_visible: Optional[np.ndarray] = None,
124
+ id_similarity: Optional[float] = 0.0,
125
+ keypoints_visibility: Optional[np.ndarray] = None) -> dict:
126
+ """Encode keypoints into heatmaps. Note that the original keypoint
127
+ coordinates should be in the input image space.
128
+
129
+ Args:
130
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
131
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
132
+ (N, K)
133
+ id_similarity (float): The usefulness of the identity information
134
+ for the whole pose. Defaults to 0.0
135
+ keypoints_visibility (np.ndarray): The visibility bit for each
136
+ keypoint (N, K). Defaults to None
137
+
138
+ Returns:
139
+ dict:
140
+ - heatmap (np.ndarray): The generated heatmap in shape
141
+ (C_out, H, W) where [W, H] is the `heatmap_size`, and the
142
+ C_out is the output channel number which depends on the
143
+ `heatmap_type`. If `heatmap_type=='gaussian'`, C_out equals to
144
+ keypoint number K; if `heatmap_type=='combined'`, C_out
145
+ equals to K*3 (x_offset, y_offset and class label)
146
+ - keypoint_weights (np.ndarray): The target weights in shape
147
+ (K,)
148
+ """
149
+ assert keypoints.shape[0] == 1, (
150
+ f'{self.__class__.__name__} only support single-instance '
151
+ 'keypoint encoding')
152
+
153
+ if keypoints_visibility is None:
154
+ keypoints_visibility = np.zeros(keypoints.shape[:2], dtype=np.float32)
155
+
156
+ if keypoints_visible is None:
157
+ keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
158
+
159
+ if self.heatmap_type == 'gaussian':
160
+ heatmaps, keypoint_weights = generate_onehot_heatmaps(
161
+ heatmap_size=self.heatmap_size,
162
+ keypoints=self._kpts_to_activation_pts(keypoints),
163
+ keypoints_visible=keypoints_visible,
164
+ sigma=self.sigma,
165
+ keypoints_visibility=keypoints_visibility,
166
+ increase_sigma_with_padding=self.increase_sigma_with_padding)
167
+ elif self.heatmap_type == 'combined':
168
+ heatmaps, keypoint_weights = generate_offset_heatmap(
169
+ heatmap_size=self.heatmap_size,
170
+ keypoints=self._kpts_to_activation_pts(keypoints),
171
+ keypoints_visible=keypoints_visible,
172
+ radius_factor=self.radius_factor)
173
+ else:
174
+ raise ValueError(
175
+ f'{self.__class__.__name__} got invalid `heatmap_type` value'
176
+ f'{self.heatmap_type}. Should be one of '
177
+ '{"gaussian", "combined"}')
178
+
179
+ if self.normalize is not None:
180
+ heatmaps_sum = np.sum(heatmaps, axis=(1, 2), keepdims=False)
181
+ mask = heatmaps_sum > 0
182
+ heatmaps[mask, :, :] = heatmaps[mask, :, :] / (heatmaps_sum[mask, None, None] + np.finfo(np.float32).eps)
183
+ heatmaps = heatmaps * self.normalize
184
+
185
+ annotated = keypoints_visible > 0
186
+
187
+ heatmap_keypoints = self._kpts_to_activation_pts(keypoints)
188
+ in_image = np.logical_and(
189
+ heatmap_keypoints[:, :, 0] >= 0,
190
+ heatmap_keypoints[:, :, 0] < self.heatmap_size[0],
191
+ )
192
+ in_image = np.logical_and(
193
+ in_image,
194
+ heatmap_keypoints[:, :, 1] >= 0,
195
+ )
196
+ in_image = np.logical_and(
197
+ in_image,
198
+ heatmap_keypoints[:, :, 1] < self.heatmap_size[1],
199
+ )
200
+
201
+ encoded = dict(
202
+ heatmaps=heatmaps,
203
+ keypoint_weights=keypoint_weights,
204
+ annotated=annotated,
205
+ in_image=in_image,
206
+ keypoints_scaled=keypoints,
207
+ heatmap_keypoints=heatmap_keypoints,
208
+ identification_similarity=id_similarity,
209
+ )
210
+
211
+ return encoded
212
+
213
+ def decode(self, encoded: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
214
+ """Decode keypoint coordinates from heatmaps. The decoded keypoint
215
+ coordinates are in the input image space.
216
+
217
+ Args:
218
+ encoded (np.ndarray): Heatmaps in shape (K, H, W)
219
+
220
+ Returns:
221
+ tuple:
222
+ - keypoints (np.ndarray): Decoded keypoint coordinates in shape
223
+ (N, K, D)
224
+ - scores (np.ndarray): The keypoint scores in shape (N, K). It
225
+ usually represents the confidence of the keypoint prediction
226
+ """
227
+ heatmaps = encoded.copy()
228
+
229
+ if self.heatmap_type == 'gaussian':
230
+ keypoints, scores = get_heatmap_maximum(heatmaps)
231
+ # unsqueeze the instance dimension for single-instance results
232
+ keypoints = keypoints[None]
233
+ scores = scores[None]
234
+
235
+ keypoints = refine_keypoints_dark_udp(
236
+ keypoints, heatmaps, blur_kernel_size=self.blur_kernel_size)
237
+
238
+ elif self.heatmap_type == 'combined':
239
+ _K, H, W = heatmaps.shape
240
+ K = _K // 3
241
+
242
+ for cls_heatmap in heatmaps[::3]:
243
+ # Apply Gaussian blur on classification maps
244
+ ks = 2 * self.blur_kernel_size + 1
245
+ cv2.GaussianBlur(cls_heatmap, (ks, ks), 0, cls_heatmap)
246
+
247
+ # valid radius
248
+ radius = self.radius_factor * max(W, H)
249
+
250
+ x_offset = heatmaps[1::3].flatten() * radius
251
+ y_offset = heatmaps[2::3].flatten() * radius
252
+ keypoints, scores = get_heatmap_maximum(heatmaps=heatmaps[::3])
253
+ index = (keypoints[..., 0] + keypoints[..., 1] * W).flatten()
254
+ index += W * H * np.arange(0, K)
255
+ index = index.astype(int)
256
+ keypoints += np.stack((x_offset[index], y_offset[index]), axis=-1)
257
+ # unsqueeze the instance dimension for single-instance results
258
+ keypoints = keypoints[None].astype(np.float32)
259
+ scores = scores[None]
260
+
261
+ keypoints = self._activation_pts_to_kpts(keypoints)
262
+
263
+ return keypoints, scores
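A minimal sketch of the OneHotHeatmap codec above, exercising the padded activation map (amap_scale > 1); the sizes are hypothetical and the 'gaussian' branch relies on generate_onehot_heatmaps from this commit's codec utils:

import numpy as np
from mmpose.codecs.onehot_heatmap import OneHotHeatmap

codec = OneHotHeatmap(input_size=(192, 256), heatmap_size=(48, 64),
                      heatmap_type='gaussian', sigma=2.0, amap_scale=1.25)
keypoints = np.random.rand(1, 17, 2).astype(np.float32) * (192, 256)

encoded = codec.encode(keypoints)                           # visibility inputs default internally
decoded, scores = codec.decode(encoded['heatmaps'])         # mapped back through the padded activation map
print(encoded['in_image'].shape, decoded.shape)             # (1, 17) (1, 17, 2)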
mmpose/codecs/regression_label.py ADDED
@@ -0,0 +1,108 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import numpy as np
6
+
7
+ from mmpose.registry import KEYPOINT_CODECS
8
+ from .base import BaseKeypointCodec
9
+
10
+
11
+ @KEYPOINT_CODECS.register_module()
12
+ class RegressionLabel(BaseKeypointCodec):
13
+ r"""Generate keypoint coordinates.
14
+
15
+ Note:
16
+
17
+ - instance number: N
18
+ - keypoint number: K
19
+ - keypoint dimension: D
20
+ - image size: [w, h]
21
+
22
+ Encoded:
23
+
24
+ - keypoint_labels (np.ndarray): The normalized regression labels in
25
+ shape (N, K, D) where D is 2 for 2d coordinates
26
+ - keypoint_weights (np.ndarray): The target weights in shape (N, K)
27
+
28
+ Args:
29
+ input_size (tuple): Input image size in [w, h]
30
+
31
+ """
32
+
33
+ label_mapping_table = dict(
34
+ keypoint_labels='keypoint_labels',
35
+ keypoint_weights='keypoint_weights',
36
+ )
37
+
38
+ def __init__(self, input_size: Tuple[int, int]) -> None:
39
+ super().__init__()
40
+
41
+ self.input_size = input_size
42
+
43
+ def encode(self,
44
+ keypoints: np.ndarray,
45
+ keypoints_visible: Optional[np.ndarray] = None) -> dict:
46
+ """Encoding keypoints from input image space to normalized space.
47
+
48
+ Args:
49
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
50
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
51
+ (N, K)
52
+
53
+ Returns:
54
+ dict:
55
+ - keypoint_labels (np.ndarray): The normalized regression labels in
56
+ shape (N, K, D) where D is 2 for 2d coordinates
57
+ - keypoint_weights (np.ndarray): The target weights in shape
58
+ (N, K)
59
+ """
60
+ if keypoints_visible is None:
61
+ keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
62
+
63
+ w, h = self.input_size
64
+ valid = ((keypoints >= 0) &
65
+ (keypoints <= [w - 1, h - 1])).all(axis=-1) & (
66
+ keypoints_visible > 0.5)
67
+
68
+ keypoint_labels = (keypoints / np.array([w, h])).astype(np.float32)
69
+ keypoint_weights = np.where(valid, 1., 0.).astype(np.float32)
70
+
71
+ encoded = dict(
72
+ keypoint_labels=keypoint_labels, keypoint_weights=keypoint_weights)
73
+
74
+ return encoded
75
+
76
+ def decode(self, encoded: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
77
+ """Decode keypoint coordinates from normalized space to input image
78
+ space.
79
+
80
+ Args:
81
+ encoded (np.ndarray): Coordinates in shape (N, K, D)
82
+
83
+ Returns:
84
+ tuple:
85
+ - keypoints (np.ndarray): Decoded coordinates in shape (N, K, D)
86
+ - scores (np.ndarray): The keypoint scores in shape (N, K).
87
+ It usually represents the confidence of the keypoint prediction
88
+ """
89
+
90
+ if encoded.shape[-1] == 2:
91
+ N, K, _ = encoded.shape
92
+ normalized_coords = encoded.copy()
93
+ scores = np.ones((N, K), dtype=np.float32)
94
+ elif encoded.shape[-1] == 4:
95
+ # split coords and sigma if outputs contain output_sigma
96
+ normalized_coords = encoded[..., :2].copy()
97
+ output_sigma = encoded[..., 2:4].copy()
98
+
99
+ scores = (1 - output_sigma).mean(axis=-1)
100
+ else:
101
+ raise ValueError(
102
+ 'Keypoint dimension should be 2 or 4 (with sigma), '
103
+ f'but got {encoded.shape[-1]}')
104
+
105
+ w, h = self.input_size
106
+ keypoints = normalized_coords * np.array([w, h])
107
+
108
+ return keypoints, scores
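A small worked example of the RegressionLabel normalization round trip above (hypothetical 192x256 input, K=2 keypoints):

import numpy as np
from mmpose.codecs.regression_label import RegressionLabel

codec = RegressionLabel(input_size=(192, 256))
keypoints = np.array([[[96., 128.], [10., 20.]]], dtype=np.float32)   # (N=1, K=2, D=2)
visible = np.ones((1, 2), dtype=np.float32)

encoded = codec.encode(keypoints, visible)
print(encoded['keypoint_labels'][0, 0])                     # [0.5 0.5] -> the image centre
decoded, scores = codec.decode(encoded['keypoint_labels'])
print(decoded[0])                                           # recovers [[ 96. 128.] [ 10.  20.]]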
mmpose/codecs/simcc_label.py ADDED
@@ -0,0 +1,311 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from itertools import product
3
+ from typing import Optional, Tuple, Union
4
+
5
+ import numpy as np
6
+
7
+ from mmpose.codecs.utils import get_simcc_maximum
8
+ from mmpose.codecs.utils.refinement import refine_simcc_dark
9
+ from mmpose.registry import KEYPOINT_CODECS
10
+ from .base import BaseKeypointCodec
11
+
12
+
13
+ @KEYPOINT_CODECS.register_module()
14
+ class SimCCLabel(BaseKeypointCodec):
15
+ r"""Generate keypoint representation via "SimCC" approach.
16
+ See the paper: `SimCC: a Simple Coordinate Classification Perspective for
17
+ Human Pose Estimation`_ by Li et al (2022) for more details.
18
+ Old name: SimDR
19
+
20
+ Note:
21
+
22
+ - instance number: N
23
+ - keypoint number: K
24
+ - keypoint dimension: D
25
+ - image size: [w, h]
26
+
27
+ Encoded:
28
+
29
+ - keypoint_x_labels (np.ndarray): The generated SimCC label for x-axis.
30
+ The label shape is (N, K, Wx) for both ``'gaussian'`` and
31
+ ``'standard'`` smoothing types, where
32
+ :math:`Wx=w*simcc_split_ratio`
33
+ - keypoint_y_labels (np.ndarray): The generated SimCC label for y-axis.
34
+ The label shape is (N, K, Wy) for both ``'gaussian'`` and
35
+ ``'standard'`` smoothing types, where
36
+ :math:`Wy=h*simcc_split_ratio`
37
+ - keypoint_weights (np.ndarray): The target weights in shape (N, K)
38
+
39
+ Args:
40
+ input_size (tuple): Input image size in [w, h]
41
+ smoothing_type (str): The SimCC label smoothing strategy. Options are
42
+ ``'gaussian'`` and ``'standard'``. Defaults to ``'gaussian'``
43
+ sigma (float | int | tuple): The sigma value in the Gaussian SimCC
44
+ label. Defaults to 6.0
45
+ simcc_split_ratio (float): The ratio of the label size to the input
46
+ size. For example, if the input width is ``w``, the x label size
47
+ will be :math:`w*simcc_split_ratio`. Defaults to 2.0
48
+ label_smooth_weight (float): Label Smoothing weight. Defaults to 0.0
49
+ normalize (bool): Whether to normalize the heatmaps. Defaults to True.
50
+ use_dark (bool): Whether to use the DARK post processing. Defaults to
51
+ False.
52
+ decode_visibility (bool): Whether to decode the visibility. Defaults
53
+ to False.
54
+ decode_beta (float): The beta value for decoding visibility. Defaults
55
+ to 150.0.
56
+
57
+ .. _`SimCC: a Simple Coordinate Classification Perspective for Human Pose
58
+ Estimation`: https://arxiv.org/abs/2107.03332
59
+ """
60
+
61
+ label_mapping_table = dict(
62
+ keypoint_x_labels='keypoint_x_labels',
63
+ keypoint_y_labels='keypoint_y_labels',
64
+ keypoint_weights='keypoint_weights',
65
+ )
66
+
67
+ def __init__(
68
+ self,
69
+ input_size: Tuple[int, int],
70
+ smoothing_type: str = 'gaussian',
71
+ sigma: Union[float, int, Tuple[float]] = 6.0,
72
+ simcc_split_ratio: float = 2.0,
73
+ label_smooth_weight: float = 0.0,
74
+ normalize: bool = True,
75
+ use_dark: bool = False,
76
+ decode_visibility: bool = False,
77
+ decode_beta: float = 150.0,
78
+ ) -> None:
79
+ super().__init__()
80
+
81
+ self.input_size = input_size
82
+ self.smoothing_type = smoothing_type
83
+ self.simcc_split_ratio = simcc_split_ratio
84
+ self.label_smooth_weight = label_smooth_weight
85
+ self.normalize = normalize
86
+ self.use_dark = use_dark
87
+ self.decode_visibility = decode_visibility
88
+ self.decode_beta = decode_beta
89
+
90
+ if isinstance(sigma, (float, int)):
91
+ self.sigma = np.array([sigma, sigma])
92
+ else:
93
+ self.sigma = np.array(sigma)
94
+
95
+ if self.smoothing_type not in {'gaussian', 'standard'}:
96
+ raise ValueError(
97
+ f'{self.__class__.__name__} got invalid `smoothing_type` value'
98
+ f'{self.smoothing_type}. Should be one of '
99
+ '{"gaussian", "standard"}')
100
+
101
+ if self.smoothing_type == 'gaussian' and self.label_smooth_weight > 0:
102
+ raise ValueError('Attribute `label_smooth_weight` is only '
103
+ 'used for `standard` mode.')
104
+
105
+ if self.label_smooth_weight < 0.0 or self.label_smooth_weight > 1.0:
106
+ raise ValueError('`label_smooth_weight` should be in range [0, 1]')
107
+
108
+ def encode(self,
109
+ keypoints: np.ndarray,
110
+ keypoints_visible: Optional[np.ndarray] = None) -> dict:
111
+ """Encoding keypoints into SimCC labels. Note that the original
112
+ keypoint coordinates should be in the input image space.
113
+
114
+ Args:
115
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
116
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
117
+ (N, K)
118
+
119
+ Returns:
120
+ dict:
121
+ - keypoint_x_labels (np.ndarray): The generated SimCC label for
122
+ x-axis.
123
+ The label shape is (N, K, Wx) for both ``'gaussian'`` and
124
+ ``'standard'`` smoothing types, where
125
+ :math:`Wx=w*simcc_split_ratio`
126
+ - keypoint_y_labels (np.ndarray): The generated SimCC label for
127
+ y-axis.
128
+ The label shape is (N, K, Wy) for both ``'gaussian'`` and
129
+ ``'standard'`` smoothing types, where
130
+ :math:`Wy=h*simcc_split_ratio`
131
+ - keypoint_weights (np.ndarray): The target weights in shape
132
+ (N, K)
133
+ """
134
+ if keypoints_visible is None:
135
+ keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
136
+
137
+ if self.smoothing_type == 'gaussian':
138
+ x_labels, y_labels, keypoint_weights = self._generate_gaussian(
139
+ keypoints, keypoints_visible)
140
+ elif self.smoothing_type == 'standard':
141
+ x_labels, y_labels, keypoint_weights = self._generate_standard(
142
+ keypoints, keypoints_visible)
143
+ else:
144
+ raise ValueError(
145
+ f'{self.__class__.__name__} got invalid `smoothing_type` value'
146
+ f'{self.smoothing_type}. Should be one of '
147
+ '{"gaussian", "standard"}')
148
+
149
+ encoded = dict(
150
+ keypoint_x_labels=x_labels,
151
+ keypoint_y_labels=y_labels,
152
+ keypoint_weights=keypoint_weights)
153
+
154
+ return encoded
155
+
156
+ def decode(self, simcc_x: np.ndarray,
157
+ simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
158
+ """Decode keypoint coordinates from SimCC representations. The decoded
159
+ coordinates are in the input image space.
160
+
161
+ Args:
162
164
+ simcc_x (np.ndarray): SimCC label for x-axis
165
+ simcc_y (np.ndarray): SimCC label for y-axis
166
+
167
+ Returns:
168
+ tuple:
169
+ - keypoints (np.ndarray): Decoded coordinates in shape (N, K, D)
170
+ - scores (np.ndarray): The keypoint scores in shape (N, K).
171
+ It usually represents the confidence of the keypoint prediction
172
+ """
173
+
174
+ keypoints, scores = get_simcc_maximum(simcc_x, simcc_y)
175
+
176
+ # Unsqueeze the instance dimension for single-instance results
177
+ if keypoints.ndim == 2:
178
+ keypoints = keypoints[None, :]
179
+ scores = scores[None, :]
180
+
181
+ if self.use_dark:
182
+ x_blur = int((self.sigma[0] * 20 - 7) // 3)
183
+ y_blur = int((self.sigma[1] * 20 - 7) // 3)
184
+ x_blur -= int((x_blur % 2) == 0)
185
+ y_blur -= int((y_blur % 2) == 0)
186
+ keypoints[:, :, 0] = refine_simcc_dark(keypoints[:, :, 0], simcc_x,
187
+ x_blur)
188
+ keypoints[:, :, 1] = refine_simcc_dark(keypoints[:, :, 1], simcc_y,
189
+ y_blur)
190
+
191
+ keypoints /= self.simcc_split_ratio
192
+
193
+ if self.decode_visibility:
194
+ _, visibility = get_simcc_maximum(
195
+ simcc_x * self.decode_beta * self.sigma[0],
196
+ simcc_y * self.decode_beta * self.sigma[1],
197
+ apply_softmax=True)
198
+ return keypoints, (scores, visibility)
199
+ else:
200
+ return keypoints, scores
201
+
202
+ def _map_coordinates(
203
+ self,
204
+ keypoints: np.ndarray,
205
+ keypoints_visible: Optional[np.ndarray] = None
206
+ ) -> Tuple[np.ndarray, np.ndarray]:
207
+ """Mapping keypoint coordinates into SimCC space."""
208
+
209
+ keypoints_split = keypoints.copy()
210
+ keypoints_split = np.around(keypoints_split * self.simcc_split_ratio)
211
+ keypoints_split = keypoints_split.astype(np.int64)
212
+ keypoint_weights = keypoints_visible.copy()
213
+
214
+ return keypoints_split, keypoint_weights
215
+
216
+ def _generate_standard(
217
+ self,
218
+ keypoints: np.ndarray,
219
+ keypoints_visible: Optional[np.ndarray] = None
220
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
221
+ """Encoding keypoints into SimCC labels with Standard Label Smoothing
222
+ strategy.
223
+
224
+ Labels will be one-hot vectors if self.label_smooth_weight==0.0
225
+ """
226
+
227
+ N, K, _ = keypoints.shape
228
+ w, h = self.input_size
229
+ W = np.around(w * self.simcc_split_ratio).astype(int)
230
+ H = np.around(h * self.simcc_split_ratio).astype(int)
231
+
232
+ keypoints_split, keypoint_weights = self._map_coordinates(
233
+ keypoints, keypoints_visible)
234
+
235
+ target_x = np.zeros((N, K, W), dtype=np.float32)
236
+ target_y = np.zeros((N, K, H), dtype=np.float32)
237
+
238
+ for n, k in product(range(N), range(K)):
239
+ # skip unlabeled keypoints
240
+ if keypoints_visible[n, k] < 0.5:
241
+ continue
242
+
243
+ # get center coordinates
244
+ mu_x, mu_y = keypoints_split[n, k].astype(np.int64)
245
+
246
+ # detect abnormal coords and assign the weight 0
247
+ if mu_x >= W or mu_y >= H or mu_x < 0 or mu_y < 0:
248
+ keypoint_weights[n, k] = 0
249
+ continue
250
+
251
+ if self.label_smooth_weight > 0:
252
+ target_x[n, k] = self.label_smooth_weight / (W - 1)
253
+ target_y[n, k] = self.label_smooth_weight / (H - 1)
254
+
255
+ target_x[n, k, mu_x] = 1.0 - self.label_smooth_weight
256
+ target_y[n, k, mu_y] = 1.0 - self.label_smooth_weight
257
+
258
+ return target_x, target_y, keypoint_weights
259
+
260
+ def _generate_gaussian(
261
+ self,
262
+ keypoints: np.ndarray,
263
+ keypoints_visible: Optional[np.ndarray] = None
264
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
265
+ """Encoding keypoints into SimCC labels with Gaussian Label Smoothing
266
+ strategy."""
267
+
268
+ N, K, _ = keypoints.shape
269
+ w, h = self.input_size
270
+ W = np.around(w * self.simcc_split_ratio).astype(int)
271
+ H = np.around(h * self.simcc_split_ratio).astype(int)
272
+
273
+ keypoints_split, keypoint_weights = self._map_coordinates(
274
+ keypoints, keypoints_visible)
275
+
276
+ target_x = np.zeros((N, K, W), dtype=np.float32)
277
+ target_y = np.zeros((N, K, H), dtype=np.float32)
278
+
279
+ # 3-sigma rule
280
+ radius = self.sigma * 3
281
+
282
+ # xy grid
283
+ x = np.arange(0, W, 1, dtype=np.float32)
284
+ y = np.arange(0, H, 1, dtype=np.float32)
285
+
286
+ for n, k in product(range(N), range(K)):
287
+ # skip unlabled keypoints
288
+ if keypoints_visible[n, k] < 0.5:
289
+ continue
290
+
291
+ mu = keypoints_split[n, k]
292
+
293
+ # check that the gaussian has in-bounds part
294
+ left, top = mu - radius
295
+ right, bottom = mu + radius + 1
296
+
297
+ if left >= W or top >= H or right < 0 or bottom < 0:
298
+ keypoint_weights[n, k] = 0
299
+ continue
300
+
301
+ mu_x, mu_y = mu
302
+
303
+ target_x[n, k] = np.exp(-((x - mu_x)**2) / (2 * self.sigma[0]**2))
304
+ target_y[n, k] = np.exp(-((y - mu_y)**2) / (2 * self.sigma[1]**2))
305
+
306
+ if self.normalize:
307
+ norm_value = self.sigma * np.sqrt(np.pi * 2)
308
+ target_x /= norm_value[0]
309
+ target_y /= norm_value[1]
310
+
311
+ return target_x, target_y, keypoint_weights
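A minimal encode/decode sketch for SimCCLabel (hypothetical 192x256 input and K=17; the label widths follow the simcc_split_ratio of 2.0):

import numpy as np
from mmpose.codecs.simcc_label import SimCCLabel

codec = SimCCLabel(input_size=(192, 256), smoothing_type='gaussian',
                   sigma=6.0, simcc_split_ratio=2.0)
keypoints = np.random.rand(1, 17, 2).astype(np.float32) * (192, 256)

encoded = codec.encode(keypoints)
print(encoded['keypoint_x_labels'].shape)                   # (1, 17, 384), Wx = 192 * 2
print(encoded['keypoint_y_labels'].shape)                   # (1, 17, 512), Wy = 256 * 2
decoded, scores = codec.decode(encoded['keypoint_x_labels'],
                               encoded['keypoint_y_labels'])   # back in the input image space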
mmpose/codecs/spr.py ADDED
@@ -0,0 +1,306 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from typing import Optional, Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ from torch import Tensor
7
+
8
+ from mmpose.registry import KEYPOINT_CODECS
9
+ from .base import BaseKeypointCodec
10
+ from .utils import (batch_heatmap_nms, generate_displacement_heatmap,
11
+ generate_gaussian_heatmaps, get_diagonal_lengths,
12
+ get_instance_root)
13
+
14
+
15
+ @KEYPOINT_CODECS.register_module()
16
+ class SPR(BaseKeypointCodec):
17
+ """Encode/decode keypoints with Structured Pose Representation (SPR).
18
+
19
+ See the paper `Single-stage multi-person pose machines`_
20
+ by Nie et al (2019) for details
21
+
22
+ Note:
23
+
24
+ - instance number: N
25
+ - keypoint number: K
26
+ - keypoint dimension: D
27
+ - image size: [w, h]
28
+ - heatmap size: [W, H]
29
+
30
+ Encoded:
31
+
32
+ - heatmaps (np.ndarray): The generated heatmap in shape (1, H, W)
33
+ where [W, H] is the `heatmap_size`. If the keypoint heatmap is
34
+ generated together, the output heatmap shape is (K+1, H, W)
35
+ - heatmap_weights (np.ndarray): The target weights for heatmaps which
36
+ has same shape with heatmaps.
37
+ - displacements (np.ndarray): The dense keypoint displacement in
38
+ shape (K*2, H, W).
39
+ - displacement_weights (np.ndarray): The target weights for heatmaps
40
+ which has same shape with displacements.
41
+
42
+ Args:
43
+ input_size (tuple): Image size in [w, h]
44
+ heatmap_size (tuple): Heatmap size in [W, H]
45
+ sigma (float or tuple, optional): The sigma values of the Gaussian
46
+ heatmaps. If sigma is a tuple, it includes both sigmas for root
47
+ and keypoint heatmaps. ``None`` means the sigmas are computed
48
+ automatically from the heatmap size. Defaults to ``None``
49
+ generate_keypoint_heatmaps (bool): Whether to generate Gaussian
50
+ heatmaps for each keypoint. Defaults to ``False``
51
+ root_type (str): The method to generate the instance root. Options
52
+ are:
53
+
54
+ - ``'kpt_center'``: Average coordinate of all visible keypoints.
55
+ - ``'bbox_center'``: Center point of bounding boxes outlined by
56
+ all visible keypoints.
57
+
58
+ Defaults to ``'kpt_center'``
59
+
60
+ minimal_diagonal_length (int or float): The threshold of diagonal
61
+ length of instance bounding box. Small instances will not be
62
+ used in training. Defaults to 32
63
+ background_weight (float): Loss weight of background pixels.
64
+ Defaults to 0.1
65
+ decode_thr (float): The threshold of keypoint response value in
66
+ heatmaps. Defaults to 0.01
67
+ decode_nms_kernel (int): The kernel size of the NMS during decoding,
68
+ which should be an odd integer. Defaults to 5
69
+ decode_max_instances (int): The maximum number of instances
70
+ to decode. Defaults to 30
71
+
72
+ .. _`Single-stage multi-person pose machines`:
73
+ https://arxiv.org/abs/1908.09220
74
+ """
75
+
76
+ field_mapping_table = dict(
77
+ heatmaps='heatmaps',
78
+ heatmap_weights='heatmap_weights',
79
+ displacements='displacements',
80
+ displacement_weights='displacement_weights',
81
+ )
82
+
83
+ def __init__(
84
+ self,
85
+ input_size: Tuple[int, int],
86
+ heatmap_size: Tuple[int, int],
87
+ sigma: Optional[Union[float, Tuple[float]]] = None,
88
+ generate_keypoint_heatmaps: bool = False,
89
+ root_type: str = 'kpt_center',
90
+ minimal_diagonal_length: Union[int, float] = 5,
91
+ background_weight: float = 0.1,
92
+ decode_nms_kernel: int = 5,
93
+ decode_max_instances: int = 30,
94
+ decode_thr: float = 0.01,
95
+ ):
96
+ super().__init__()
97
+
98
+ self.input_size = input_size
99
+ self.heatmap_size = heatmap_size
100
+ self.generate_keypoint_heatmaps = generate_keypoint_heatmaps
101
+ self.root_type = root_type
102
+ self.minimal_diagonal_length = minimal_diagonal_length
103
+ self.background_weight = background_weight
104
+ self.decode_nms_kernel = decode_nms_kernel
105
+ self.decode_max_instances = decode_max_instances
106
+ self.decode_thr = decode_thr
107
+
108
+ self.scale_factor = (np.array(input_size) /
109
+ heatmap_size).astype(np.float32)
110
+
111
+ if sigma is None:
112
+ sigma = (heatmap_size[0] * heatmap_size[1])**0.5 / 32
113
+ if generate_keypoint_heatmaps:
114
+ # sigma for root heatmap and keypoint heatmaps
115
+ self.sigma = (sigma, sigma // 2)
116
+ else:
117
+ self.sigma = (sigma, )
118
+ else:
119
+ if not isinstance(sigma, (tuple, list)):
120
+ sigma = (sigma, )
121
+ if generate_keypoint_heatmaps:
122
+ assert len(sigma) == 2, 'sigma for keypoints must be given ' \
123
+ 'if `generate_keypoint_heatmaps` ' \
124
+ 'is True. e.g. sigma=(4, 2)'
125
+ self.sigma = sigma
126
+
127
+ def _get_heatmap_weights(self,
128
+ heatmaps,
129
+ fg_weight: float = 1,
130
+ bg_weight: float = 0):
131
+ """Generate weight array for heatmaps.
132
+
133
+ Args:
134
+ heatmaps (np.ndarray): Root and keypoint (optional) heatmaps
135
+ fg_weight (float): Weight for foreground pixels. Defaults to 1.0
136
+ bg_weight (float): Weight for background pixels. Defaults to 0.0
137
+
138
+ Returns:
139
+ np.ndarray: Heatmap weight array in the same shape with heatmaps
140
+ """
141
+ heatmap_weights = np.ones(heatmaps.shape, dtype=np.float32) * bg_weight
142
+ heatmap_weights[heatmaps > 0] = fg_weight
143
+ return heatmap_weights
144
+
145
+ def encode(self,
146
+ keypoints: np.ndarray,
147
+ keypoints_visible: Optional[np.ndarray] = None) -> dict:
148
+ """Encode keypoints into root heatmaps and keypoint displacement
149
+ fields. Note that the original keypoint coordinates should be in the
150
+ input image space.
151
+
152
+ Args:
153
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
154
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
155
+ (N, K)
156
+
157
+ Returns:
158
+ dict:
159
+ - heatmaps (np.ndarray): The generated heatmap in shape
160
+ (1, H, W) where [W, H] is the `heatmap_size`. If keypoint
161
+ heatmaps are generated together, the shape is (K+1, H, W)
162
+ - heatmap_weights (np.ndarray): The pixel-wise weight for heatmaps
163
+ which has same shape with `heatmaps`
164
+ - displacements (np.ndarray): The generated displacement fields in
165
+ shape (K*D, H, W). The vector on each pixels represents the
166
+ displacement of keypoints belong to the associated instance
167
+ from this pixel.
168
+ - displacement_weights (np.ndarray): The pixel-wise weight for
169
+ displacements which has same shape with `displacements`
170
+ """
171
+
172
+ if keypoints_visible is None:
173
+ keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
174
+
175
+ # keypoint coordinates in heatmap
176
+ _keypoints = keypoints / self.scale_factor
177
+
178
+ # compute the root and scale of each instance
179
+ roots, roots_visible = get_instance_root(_keypoints, keypoints_visible,
180
+ self.root_type)
181
+ diagonal_lengths = get_diagonal_lengths(_keypoints, keypoints_visible)
182
+
183
+ # discard the small instances
184
+ roots_visible[diagonal_lengths < self.minimal_diagonal_length] = 0
185
+
186
+ # generate heatmaps
187
+ heatmaps, _ = generate_gaussian_heatmaps(
188
+ heatmap_size=self.heatmap_size,
189
+ keypoints=roots[:, None],
190
+ keypoints_visible=roots_visible[:, None],
191
+ sigma=self.sigma[0])
192
+ heatmap_weights = self._get_heatmap_weights(
193
+ heatmaps, bg_weight=self.background_weight)
194
+
195
+ if self.generate_keypoint_heatmaps:
196
+ keypoint_heatmaps, _ = generate_gaussian_heatmaps(
197
+ heatmap_size=self.heatmap_size,
198
+ keypoints=_keypoints,
199
+ keypoints_visible=keypoints_visible,
200
+ sigma=self.sigma[1])
201
+
202
+ keypoint_heatmaps_weights = self._get_heatmap_weights(
203
+ keypoint_heatmaps, bg_weight=self.background_weight)
204
+
205
+ heatmaps = np.concatenate((keypoint_heatmaps, heatmaps), axis=0)
206
+ heatmap_weights = np.concatenate(
207
+ (keypoint_heatmaps_weights, heatmap_weights), axis=0)
208
+
209
+ # generate displacements
210
+ displacements, displacement_weights = \
211
+ generate_displacement_heatmap(
212
+ self.heatmap_size,
213
+ _keypoints,
214
+ keypoints_visible,
215
+ roots,
216
+ roots_visible,
217
+ diagonal_lengths,
218
+ self.sigma[0],
219
+ )
220
+
221
+ encoded = dict(
222
+ heatmaps=heatmaps,
223
+ heatmap_weights=heatmap_weights,
224
+ displacements=displacements,
225
+ displacement_weights=displacement_weights)
226
+
227
+ return encoded
228
+
229
+ def decode(self, heatmaps: Tensor,
230
+ displacements: Tensor) -> Tuple[np.ndarray, np.ndarray]:
231
+ """Decode the keypoint coordinates from heatmaps and displacements. The
232
+ decoded keypoint coordinates are in the input image space.
233
+
234
+ Args:
235
+ heatmaps (Tensor): Encoded root and keypoints (optional) heatmaps
236
+ in shape (1, H, W) or (K+1, H, W)
237
+ displacements (Tensor): Encoded keypoints displacement fields
238
+ in shape (K*D, H, W)
239
+
240
+ Returns:
241
+ tuple:
242
+ - keypoints (Tensor): Decoded keypoint coordinates in shape
243
+ (N, K, D)
244
+ - scores (tuple):
245
+ - root_scores (Tensor): The root scores in shape (N, )
246
+ - keypoint_scores (Tensor): The keypoint scores in
247
+ shape (N, K). If keypoint heatmaps are not generated,
248
+ `keypoint_scores` will be `None`
249
+ """
250
+ # heatmaps, displacements = encoded
251
+ _k, h, w = displacements.shape
252
+ k = _k // 2
253
+ displacements = displacements.view(k, 2, h, w)
254
+
255
+ # convert displacements to a dense keypoint prediction
256
+ y, x = torch.meshgrid(torch.arange(h), torch.arange(w))
257
+ regular_grid = torch.stack([x, y], dim=0).to(displacements)
258
+ posemaps = (regular_grid[None] + displacements).flatten(2)
259
+
260
+ # find local maximum on root heatmap
261
+ root_heatmap_peaks = batch_heatmap_nms(heatmaps[None, -1:],
262
+ self.decode_nms_kernel)
263
+ root_scores, pos_idx = root_heatmap_peaks.flatten().topk(
264
+ self.decode_max_instances)
265
+ mask = root_scores > self.decode_thr
266
+ root_scores, pos_idx = root_scores[mask], pos_idx[mask]
267
+
268
+ keypoints = posemaps[:, :, pos_idx].permute(2, 0, 1).contiguous()
269
+
270
+ if self.generate_keypoint_heatmaps and heatmaps.shape[0] == 1 + k:
271
+ # compute scores for each keypoint
272
+ keypoint_scores = self.get_keypoint_scores(heatmaps[:k], keypoints)
273
+ else:
274
+ keypoint_scores = None
275
+
276
+ keypoints = torch.cat([
277
+ kpt * self.scale_factor[i]
278
+ for i, kpt in enumerate(keypoints.split(1, -1))
279
+ ],
280
+ dim=-1)
281
+ return keypoints, (root_scores, keypoint_scores)
282
+
283
+ def get_keypoint_scores(self, heatmaps: Tensor, keypoints: Tensor):
284
+ """Calculate the keypoint scores with keypoints heatmaps and
285
+ coordinates.
286
+
287
+ Args:
288
+ heatmaps (Tensor): Keypoint heatmaps in shape (K, H, W)
289
+ keypoints (Tensor): Keypoint coordinates in shape (N, K, D)
290
+
291
+ Returns:
292
+ Tensor: Keypoint scores in [N, K]
293
+ """
294
+ k, h, w = heatmaps.shape
295
+ keypoints = torch.stack((
296
+ keypoints[..., 0] / (w - 1) * 2 - 1,
297
+ keypoints[..., 1] / (h - 1) * 2 - 1,
298
+ ),
299
+ dim=-1)
300
+ keypoints = keypoints.transpose(0, 1).unsqueeze(1).contiguous()
301
+
302
+ keypoint_scores = torch.nn.functional.grid_sample(
303
+ heatmaps.unsqueeze(1), keypoints,
304
+ padding_mode='border').view(k, -1).transpose(0, 1).contiguous()
305
+
306
+ return keypoint_scores
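A minimal multi-instance encoding sketch for SPR; the sizes are hypothetical, and decoding would additionally require torch tensors of predicted heatmaps and displacements:

import numpy as np
from mmpose.codecs.spr import SPR

codec = SPR(input_size=(512, 512), heatmap_size=(128, 128),
            sigma=(4.0, 2.0), generate_keypoint_heatmaps=True)
keypoints = np.random.rand(3, 17, 2).astype(np.float32) * 512        # three instances, K=17
visible = np.ones((3, 17), dtype=np.float32)

targets = codec.encode(keypoints, visible)
print(targets['heatmaps'].shape)                            # (18, 128, 128): K keypoint maps + 1 root map
print(targets['displacements'].shape)                       # (34, 128, 128): K*2 displacement channels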
mmpose/codecs/udp_heatmap.py ADDED
@@ -0,0 +1,263 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from typing import Optional, Tuple
3
+
4
+ import cv2
5
+ import numpy as np
6
+
7
+ from mmpose.registry import KEYPOINT_CODECS
8
+ from .base import BaseKeypointCodec
9
+ from .utils import (generate_offset_heatmap, generate_udp_gaussian_heatmaps,
10
+ get_heatmap_maximum, refine_keypoints_dark_udp)
11
+
12
+
13
+ @KEYPOINT_CODECS.register_module()
14
+ class UDPHeatmap(BaseKeypointCodec):
15
+ r"""Generate keypoint heatmaps by Unbiased Data Processing (UDP).
16
+ See the paper: `The Devil is in the Details: Delving into Unbiased Data
17
+ Processing for Human Pose Estimation`_ by Huang et al (2020) for details.
18
+
19
+ Note:
20
+
21
+ - instance number: N
22
+ - keypoint number: K
23
+ - keypoint dimension: D
24
+ - image size: [w, h]
25
+ - heatmap size: [W, H]
26
+
27
+ Encoded:
28
+
29
+ - heatmap (np.ndarray): The generated heatmap in shape (C_out, H, W)
30
+ where [W, H] is the `heatmap_size`, and the C_out is the output
31
+ channel number which depends on the `heatmap_type`. If
32
+ `heatmap_type=='gaussian'`, C_out equals to keypoint number K;
33
+ if `heatmap_type=='combined'`, C_out equals to K*3
34
+ (x_offset, y_offset and class label)
35
+ - keypoint_weights (np.ndarray): The target weights in shape (K,)
36
+
37
+ Args:
38
+ input_size (tuple): Image size in [w, h]
39
+ heatmap_size (tuple): Heatmap size in [W, H]
40
+ heatmap_type (str): The heatmap type to encode the keypoints. Options
41
+ are:
42
+
43
+ - ``'gaussian'``: Gaussian heatmap
44
+ - ``'combined'``: Combination of a binary label map and offset
45
+ maps for X and Y axes.
46
+
47
+ sigma (float): The sigma value of the Gaussian heatmap when
48
+ ``heatmap_type=='gaussian'``. Defaults to 2.0
49
+ radius_factor (float): The radius factor of the binary label
50
+ map when ``heatmap_type=='combined'``. The positive region is
51
+ defined as the neighbor of the keypoint with the radius
52
+ :math:`r=radius_factor*max(W, H)`. Defaults to 0.0546875
53
+ blur_kernel_size (int): The Gaussian blur kernel size of the heatmap
54
+ modulation in DarkPose. Defaults to 11
55
+
56
+ .. _`The Devil is in the Details: Delving into Unbiased Data Processing for
57
+ Human Pose Estimation`: https://arxiv.org/abs/1911.07524
58
+ """
59
+
60
+ label_mapping_table = dict(keypoint_weights='keypoint_weights', )
61
+ field_mapping_table = dict(heatmaps='heatmaps', )
62
+
63
+ def __init__(self,
64
+ input_size: Tuple[int, int],
65
+ heatmap_size: Tuple[int, int],
66
+ heatmap_type: str = 'gaussian',
67
+ sigma: float = 2.,
68
+ radius_factor: float = 0.0546875,
69
+ blur_kernel_size: int = 11,
70
+ increase_sigma_with_padding=False,
71
+ amap_scale: float = 1.0,
72
+ normalize=None,
73
+ ) -> None:
74
+ super().__init__()
75
+ self.input_size = np.array(input_size)
76
+ self.heatmap_size = np.array(heatmap_size)
77
+ self.sigma = sigma
78
+ self.radius_factor = radius_factor
79
+ self.heatmap_type = heatmap_type
80
+ self.blur_kernel_size = blur_kernel_size
81
+ self.increase_sigma_with_padding = increase_sigma_with_padding
82
+ self.normalize = normalize
83
+
84
+ self.amap_size = self.input_size * amap_scale
85
+ self.scale_factor = ((self.amap_size - 1) /
86
+ (self.heatmap_size - 1)).astype(np.float32)
87
+ self.input_center = self.input_size / 2
88
+ self.top_left = self.input_center - self.amap_size / 2
89
+
90
+ if self.heatmap_type not in {'gaussian', 'combined'}:
91
+ raise ValueError(
92
+ f'{self.__class__.__name__} got invalid `heatmap_type` value'
93
+ f'{self.heatmap_type}. Should be one of '
94
+ '{"gaussian", "combined"}')
95
+
96
+ def _kpts_to_activation_pts(self, keypoints: np.ndarray) -> np.ndarray:
97
+ """
98
+ Transform the keypoint coordinates to the activation space.
99
+ In the original UDPHeatmap, the activation map covers the same space as the input
100
+ image, only at a lower resolution. Here the activation map may instead have a
101
+ different (padded) size than the input image space.
102
+ Centers of the activation map and the input image space are aligned.
103
+ """
104
+ transformed_keypoints = keypoints - self.top_left
105
+ transformed_keypoints = transformed_keypoints / self.scale_factor
106
+ return transformed_keypoints
107
+
108
+ def _activation_pts_to_kpts(self, keypoints: np.ndarray) -> np.ndarray:
109
+ """
110
+ Transform the points in activation map to the keypoint coordinates.
111
+ In the original UDPHeatmap, the activation map covers the same space as the input
112
+ image, only at a lower resolution. Here the activation map may instead have a
113
+ different (padded) size than the input image space.
114
+ Centers of the activation map and the input image space are aligned.
115
+ """
116
+ W, H = self.heatmap_size
117
+ transformed_keypoints = keypoints / [W - 1, H - 1] * self.amap_size
118
+ transformed_keypoints += self.top_left
119
+ return transformed_keypoints
120
+
121
+ def encode(self,
122
+ keypoints: np.ndarray,
123
+ keypoints_visible: Optional[np.ndarray] = None,
124
+ id_similarity: Optional[float] = 0.0,
125
+ keypoints_visibility: Optional[np.ndarray] = None) -> dict:
126
+ """Encode keypoints into heatmaps. Note that the original keypoint
127
+ coordinates should be in the input image space.
128
+
129
+ Args:
130
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
131
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
132
+ (N, K)
133
+ id_similarity (float): The usefulness of the identity information
134
+ for the whole pose. Defaults to 0.0
135
+ keypoints_visibility (np.ndarray): The visibility bit for each
136
+ keypoint (N, K). Defaults to None
137
+
138
+ Returns:
139
+ dict:
140
+ - heatmap (np.ndarray): The generated heatmap in shape
141
+ (C_out, H, W) where [W, H] is the `heatmap_size`, and the
142
+ C_out is the output channel number which depends on the
143
+ `heatmap_type`. If `heatmap_type=='gaussian'`, C_out equals to
144
+ keypoint number K; if `heatmap_type=='combined'`, C_out
145
+ equals to K*3 (x_offset, y_offset and class label)
146
+ - keypoint_weights (np.ndarray): The target weights in shape
147
+ (K,)
148
+ """
149
+ assert keypoints.shape[0] == 1, (
150
+ f'{self.__class__.__name__} only support single-instance '
151
+ 'keypoint encoding')
152
+
153
+ if keypoints_visibility is None:
154
+ keypoints_visibility = np.zeros(keypoints.shape[:2], dtype=np.float32)
155
+
156
+ if keypoints_visible is None:
157
+ keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
158
+
159
+ if self.heatmap_type == 'gaussian':
160
+ heatmaps, keypoint_weights = generate_udp_gaussian_heatmaps(
161
+ heatmap_size=self.heatmap_size,
162
+ keypoints=self._kpts_to_activation_pts(keypoints),
163
+ keypoints_visible=keypoints_visible,
164
+ sigma=self.sigma,
165
+ keypoints_visibility=keypoints_visibility,
166
+ increase_sigma_with_padding=self.increase_sigma_with_padding)
167
+ elif self.heatmap_type == 'combined':
168
+ heatmaps, keypoint_weights = generate_offset_heatmap(
169
+ heatmap_size=self.heatmap_size,
170
+ keypoints=self._kpts_to_activation_pts(keypoints),
171
+ keypoints_visible=keypoints_visible,
172
+ radius_factor=self.radius_factor)
173
+ else:
174
+ raise ValueError(
175
+ f'{self.__class__.__name__} got invalid `heatmap_type` value'
176
+ f'{self.heatmap_type}. Should be one of '
177
+ '{"gaussian", "combined"}')
178
+
179
+ if self.normalize is not None:
180
+ heatmaps_sum = np.sum(heatmaps, axis=(1, 2), keepdims=False)
181
+ mask = heatmaps_sum > 0
182
+ heatmaps[mask, :, :] = heatmaps[mask, :, :] / (heatmaps_sum[mask, None, None] + np.finfo(np.float32).eps)
183
+ heatmaps = heatmaps * self.normalize
184
+
185
+ annotated = keypoints_visible > 0
186
+
187
+ heatmap_keypoints = self._kpts_to_activation_pts(keypoints)
188
+ in_image = np.logical_and(
189
+ heatmap_keypoints[:, :, 0] >= 0,
190
+ heatmap_keypoints[:, :, 0] < self.heatmap_size[0],
191
+ )
192
+ in_image = np.logical_and(
193
+ in_image,
194
+ heatmap_keypoints[:, :, 1] >= 0,
195
+ )
196
+ in_image = np.logical_and(
197
+ in_image,
198
+ heatmap_keypoints[:, :, 1] < self.heatmap_size[1],
199
+ )
200
+
201
+ encoded = dict(
202
+ heatmaps=heatmaps,
203
+ keypoint_weights=keypoint_weights,
204
+ annotated=annotated,
205
+ in_image=in_image,
206
+ keypoints_scaled=keypoints,
207
+ heatmap_keypoints=heatmap_keypoints,
208
+ identification_similarity=id_similarity,
209
+ )
210
+
211
+ return encoded
212
+
213
+ def decode(self, encoded: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
214
+ """Decode keypoint coordinates from heatmaps. The decoded keypoint
215
+ coordinates are in the input image space.
216
+
217
+ Args:
218
+ encoded (np.ndarray): Heatmaps in shape (K, H, W)
219
+
220
+ Returns:
221
+ tuple:
222
+ - keypoints (np.ndarray): Decoded keypoint coordinates in shape
223
+ (N, K, D)
224
+ - scores (np.ndarray): The keypoint scores in shape (N, K). It
225
+ usually represents the confidence of the keypoint prediction
226
+ """
227
+ heatmaps = encoded.copy()
228
+
229
+ if self.heatmap_type == 'gaussian':
230
+ keypoints, scores = get_heatmap_maximum(heatmaps)
231
+ # unsqueeze the instance dimension for single-instance results
232
+ keypoints = keypoints[None]
233
+ scores = scores[None]
234
+
235
+ keypoints = refine_keypoints_dark_udp(
236
+ keypoints, heatmaps, blur_kernel_size=self.blur_kernel_size)
237
+
238
+ elif self.heatmap_type == 'combined':
239
+ _K, H, W = heatmaps.shape
240
+ K = _K // 3
241
+
242
+ for cls_heatmap in heatmaps[::3]:
243
+ # Apply Gaussian blur on classification maps
244
+ ks = 2 * self.blur_kernel_size + 1
245
+ cv2.GaussianBlur(cls_heatmap, (ks, ks), 0, cls_heatmap)
246
+
247
+ # valid radius
248
+ radius = self.radius_factor * max(W, H)
249
+
250
+ x_offset = heatmaps[1::3].flatten() * radius
251
+ y_offset = heatmaps[2::3].flatten() * radius
252
+ keypoints, scores = get_heatmap_maximum(heatmaps=heatmaps[::3])
253
+ index = (keypoints[..., 0] + keypoints[..., 1] * W).flatten()
254
+ index += W * H * np.arange(0, K)
255
+ index = index.astype(int)
256
+ keypoints += np.stack((x_offset[index], y_offset[index]), axis=-1)
257
+ # unsqueeze the instance dimension for single-instance results
258
+ keypoints = keypoints[None].astype(np.float32)
259
+ scores = scores[None]
260
+
261
+ keypoints = self._activation_pts_to_kpts(keypoints)
262
+
263
+ return keypoints, scores
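A minimal round trip for the UDPHeatmap variant above; with the default amap_scale=1.0 the activation map matches the input space, so it behaves like the standard UDP codec (sizes hypothetical):

import numpy as np
from mmpose.codecs.udp_heatmap import UDPHeatmap

codec = UDPHeatmap(input_size=(192, 256), heatmap_size=(48, 64), sigma=2.0)
keypoints = np.random.rand(1, 17, 2).astype(np.float32) * (192, 256)

encoded = codec.encode(keypoints)                           # Gaussian heatmaps of shape (17, 64, 48)
decoded, scores = codec.decode(encoded['heatmaps'])         # DARK-UDP refined, back in input space
print(decoded.shape, scores.shape)                          # (1, 17, 2) (1, 17)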
mmpose/codecs/utils/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from .camera_image_projection import (camera_to_image_coord, camera_to_pixel,
3
+ pixel_to_camera)
4
+ from .gaussian_heatmap import (generate_3d_gaussian_heatmaps,
5
+ generate_gaussian_heatmaps,
6
+ generate_udp_gaussian_heatmaps,
7
+ generate_unbiased_gaussian_heatmaps,
8
+ generate_onehot_heatmaps)
9
+ from .instance_property import (get_diagonal_lengths, get_instance_bbox,
10
+ get_instance_root)
11
+ from .offset_heatmap import (generate_displacement_heatmap,
12
+ generate_offset_heatmap)
13
+ from .post_processing import (batch_heatmap_nms, gaussian_blur,
14
+ gaussian_blur1d, get_heatmap_3d_maximum,
15
+ get_heatmap_maximum, get_simcc_maximum,
16
+ get_simcc_normalized, get_heatmap_expected_value)
17
+ from .refinement import (refine_keypoints, refine_keypoints_dark,
18
+ refine_keypoints_dark_udp, refine_simcc_dark)
19
+ from .oks_map import generate_oks_maps
20
+
21
+ __all__ = [
22
+ 'generate_gaussian_heatmaps', 'generate_udp_gaussian_heatmaps',
23
+ 'generate_unbiased_gaussian_heatmaps', 'gaussian_blur',
24
+ 'get_heatmap_maximum', 'get_simcc_maximum', 'generate_offset_heatmap',
25
+ 'batch_heatmap_nms', 'refine_keypoints', 'refine_keypoints_dark',
26
+ 'refine_keypoints_dark_udp', 'generate_displacement_heatmap',
27
+ 'refine_simcc_dark', 'gaussian_blur1d', 'get_diagonal_lengths',
28
+ 'get_instance_root', 'get_instance_bbox', 'get_simcc_normalized',
29
+ 'camera_to_image_coord', 'camera_to_pixel', 'pixel_to_camera',
30
+ 'get_heatmap_3d_maximum', 'generate_3d_gaussian_heatmaps',
31
+ 'generate_oks_maps', 'get_heatmap_expected_value', 'generate_onehot_heatmaps'
32
+ ]
mmpose/codecs/utils/camera_image_projection.py ADDED
@@ -0,0 +1,102 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from typing import Dict, Tuple
3
+
4
+ import numpy as np
5
+
6
+
7
+ def camera_to_image_coord(root_index: int, kpts_3d_cam: np.ndarray,
8
+ camera_param: Dict) -> Tuple[np.ndarray, np.ndarray]:
9
+ """Project keypoints from camera space to image space and calculate factor.
10
+
11
+ Args:
12
+ root_index (int): Index for root keypoint.
13
+ kpts_3d_cam (np.ndarray): Keypoint coordinates in camera space in
14
+ shape (N, K, D).
15
+ camera_param (dict): Parameters for the camera.
16
+
17
+ Returns:
18
+ tuple:
19
+ - kpts_3d_image (np.ndarray): Keypoint coordinates in image space in
20
+ shape (N, K, D).
21
+ - factor (np.ndarray): The scaling factor that maps keypoints from
22
+ image space to camera space in shape (N, ).
23
+ """
24
+
25
+ root = kpts_3d_cam[..., root_index, :]
26
+ tl_kpt = root.copy()
27
+ tl_kpt[..., :2] -= 1.0
28
+ br_kpt = root.copy()
29
+ br_kpt[..., :2] += 1.0
30
+ tl_kpt = np.reshape(tl_kpt, (-1, 3))
31
+ br_kpt = np.reshape(br_kpt, (-1, 3))
32
+ fx, fy = camera_param['f'] / 1000.
33
+ cx, cy = camera_param['c'] / 1000.
34
+
35
+ tl2d = camera_to_pixel(tl_kpt, fx, fy, cx, cy)
36
+ br2d = camera_to_pixel(br_kpt, fx, fy, cx, cy)
37
+
38
+ rectangle_3d_size = 2.0
39
+ kpts_3d_image = np.zeros_like(kpts_3d_cam)
40
+ kpts_3d_image[..., :2] = camera_to_pixel(kpts_3d_cam.copy(), fx, fy, cx,
41
+ cy)
42
+ ratio = (br2d[..., 0] - tl2d[..., 0] + 0.001) / rectangle_3d_size
43
+ factor = rectangle_3d_size / (br2d[..., 0] - tl2d[..., 0] + 0.001)
44
+ kpts_3d_depth = ratio[:, None] * (
45
+ kpts_3d_cam[..., 2] - kpts_3d_cam[..., root_index:root_index + 1, 2])
46
+ kpts_3d_image[..., 2] = kpts_3d_depth
47
+ return kpts_3d_image, factor
48
+
49
+
50
+ def camera_to_pixel(kpts_3d: np.ndarray,
51
+ fx: float,
52
+ fy: float,
53
+ cx: float,
54
+ cy: float,
55
+ shift: bool = False) -> np.ndarray:
56
+ """Project keypoints from camera space to image space.
57
+
58
+ Args:
59
+ kpts_3d (np.ndarray): Keypoint coordinates in camera space.
60
+ fx (float): x-coordinate of camera's focal length.
61
+ fy (float): y-coordinate of camera's focal length.
62
+ cx (float): x-coordinate of image center.
63
+ cy (float): y-coordinate of image center.
64
+ shift (bool): Whether to shift the coordinates by 1e-8.
65
+
66
+ Returns:
67
+ pose_2d (np.ndarray): Projected keypoint coordinates in image space.
68
+ """
69
+ if not shift:
70
+ pose_2d = kpts_3d[..., :2] / kpts_3d[..., 2:3]
71
+ else:
72
+ pose_2d = kpts_3d[..., :2] / (kpts_3d[..., 2:3] + 1e-8)
73
+ pose_2d[..., 0] *= fx
74
+ pose_2d[..., 1] *= fy
75
+ pose_2d[..., 0] += cx
76
+ pose_2d[..., 1] += cy
77
+ return pose_2d
78
+
79
+
80
+ def pixel_to_camera(kpts_3d: np.ndarray, fx: float, fy: float, cx: float,
81
+ cy: float) -> np.ndarray:
82
+ """Project keypoints from camera space to image space.
83
+
84
+ Args:
85
+ kpts_3d (np.ndarray): Keypoint coordinates in image space, with depth in the last channel.
86
+ fx (float): x-coordinate of camera's focal length.
87
+ fy (float): y-coordinate of camera's focal length.
88
+ cx (float): x-coordinate of image center.
89
+ cy (float): y-coordinate of image center.
91
+
92
+ Returns:
93
+ pose_2d (np.ndarray): Back-projected keypoint coordinates in camera space.
94
+ """
95
+ pose_2d = kpts_3d.copy()
96
+ pose_2d[..., 0] -= cx
97
+ pose_2d[..., 1] -= cy
98
+ pose_2d[..., 0] /= fx
99
+ pose_2d[..., 1] /= fy
100
+ pose_2d[..., 0] *= kpts_3d[..., 2]
101
+ pose_2d[..., 1] *= kpts_3d[..., 2]
102
+ return pose_2d
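A short usage sketch for the two projections above; the intrinsics are illustrative and the import path assumes the package layout added in this commit.

import numpy as np
from mmpose.codecs.utils.camera_image_projection import camera_to_pixel, pixel_to_camera

fx = fy = 1000.0                        # focal lengths in pixels (illustrative)
cx, cy = 512.0, 384.0                   # principal point (illustrative)
pt_cam = np.array([[0.0, 0.0, 2.0]])    # a point on the optical axis, 2 units deep
pt_img = camera_to_pixel(pt_cam, fx, fy, cx, cy)   # -> [[512., 384.]]
pt_back = pixel_to_camera(np.array([[512.0, 384.0, 2.0]]), fx, fy, cx, cy)
# pt_back -> [[0., 0., 2.]], recovering the original camera-space point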
mmpose/codecs/utils/gaussian_heatmap.py ADDED
@@ -0,0 +1,433 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from itertools import product
3
+ from typing import Optional, Tuple, Union
4
+
5
+ import numpy as np
6
+ from scipy.spatial.distance import cdist
7
+
8
+
9
+ def generate_3d_gaussian_heatmaps(
10
+ heatmap_size: Tuple[int, int, int],
11
+ keypoints: np.ndarray,
12
+ keypoints_visible: np.ndarray,
13
+ sigma: Union[float, Tuple[float], np.ndarray],
14
+ image_size: Tuple[int, int],
15
+ heatmap3d_depth_bound: float = 400.0,
16
+ joint_indices: Optional[list] = None,
17
+ max_bound: float = 1.0,
18
+ use_different_joint_weights: bool = False,
19
+ dataset_keypoint_weights: Optional[np.ndarray] = None
20
+ ) -> Tuple[np.ndarray, np.ndarray]:
21
+ """Generate 3d gaussian heatmaps of keypoints.
22
+
23
+ Args:
24
+ heatmap_size (Tuple[int, int, int]): Heatmap size in [W, H, D]
25
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, C)
26
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
27
+ (N, K)
28
+ sigma (float or List[float]): A list of sigma values of the Gaussian
29
+ heatmap for each instance. If sigma is given as a single float
30
+ value, it will be expanded into a tuple
31
+ image_size (Tuple[int, int]): Size of input image.
32
+ heatmap3d_depth_bound (float): Boundary for 3d heatmap depth.
33
+ Default: 400.0.
34
+ joint_indices (List[int], optional): Indices of joints used for heatmap
35
+ generation. If None (default) is given, all joints will be used.
36
+ Default: ``None``.
37
+ max_bound (float): The maximal value of heatmap. Default: 1.0.
38
+ use_different_joint_weights (bool): Whether to use different joint
39
+ weights. Default: ``False``.
40
+ dataset_keypoint_weights (np.ndarray, optional): Keypoints weight in
41
+ shape (K, ).
42
+
43
+ Returns:
44
+ tuple:
45
+ - heatmaps (np.ndarray): The generated heatmap in shape
46
+ (K * D, H, W) where [W, H, D] is the `heatmap_size`
47
+ - keypoint_weights (np.ndarray): The target weights in shape
48
+ (N, K)
49
+ """
50
+
51
+ W, H, D = heatmap_size
52
+
53
+ # select the joints used for target generation
54
+ if joint_indices is not None:
55
+ keypoints = keypoints[:, joint_indices, ...]
56
+ keypoints_visible = keypoints_visible[:, joint_indices, ...]
57
+ N, K, _ = keypoints.shape
58
+
59
+ heatmaps = np.zeros([K, D, H, W], dtype=np.float32)
60
+ keypoint_weights = keypoints_visible.copy()
61
+
62
+ if isinstance(sigma, (int, float)):
63
+ sigma = (sigma, ) * N
64
+
65
+ for n in range(N):
66
+ # 3-sigma rule
67
+ radius = sigma[n] * 3
68
+
69
+ # joint location in heatmap coordinates
70
+ mu_x = keypoints[n, :, 0] * W / image_size[0] # (K, )
71
+ mu_y = keypoints[n, :, 1] * H / image_size[1]
72
+ mu_z = (keypoints[n, :, 2] / heatmap3d_depth_bound + 0.5) * D
73
+
74
+ keypoint_weights[n, ...] = keypoint_weights[n, ...] * (mu_z >= 0) * (
75
+ mu_z < D)
76
+ if use_different_joint_weights:
77
+ keypoint_weights[
78
+ n] = keypoint_weights[n] * dataset_keypoint_weights
79
+ # xy grid
80
+ gaussian_size = 2 * radius + 1
81
+
82
+ # get neighboring voxels coordinates
83
+ x = y = z = np.arange(gaussian_size, dtype=np.float32) - radius
84
+ zz, yy, xx = np.meshgrid(z, y, x)
85
+
86
+ xx = np.expand_dims(xx, axis=0)
87
+ yy = np.expand_dims(yy, axis=0)
88
+ zz = np.expand_dims(zz, axis=0)
89
+ mu_x = np.expand_dims(mu_x, axis=(-1, -2, -3))
90
+ mu_y = np.expand_dims(mu_y, axis=(-1, -2, -3))
91
+ mu_z = np.expand_dims(mu_z, axis=(-1, -2, -3))
92
+
93
+ xx, yy, zz = xx + mu_x, yy + mu_y, zz + mu_z
94
+ local_size = xx.shape[1]
95
+
96
+ # round the coordinates
97
+ xx = xx.round().clip(0, W - 1)
98
+ yy = yy.round().clip(0, H - 1)
99
+ zz = zz.round().clip(0, D - 1)
100
+
101
+ # compute the target value near joints
102
+ gaussian = np.exp(-((xx - mu_x)**2 + (yy - mu_y)**2 + (zz - mu_z)**2) /
103
+ (2 * sigma[n]**2))
104
+
105
+ # put the local target value to the full target heatmap
106
+ idx_joints = np.tile(
107
+ np.expand_dims(np.arange(K), axis=(-1, -2, -3)),
108
+ [1, local_size, local_size, local_size])
109
+ idx = np.stack([idx_joints, zz, yy, xx],
110
+ axis=-1).astype(int).reshape(-1, 4)
111
+
112
+ heatmaps[idx[:, 0], idx[:, 1], idx[:, 2], idx[:, 3]] = np.maximum(
113
+ heatmaps[idx[:, 0], idx[:, 1], idx[:, 2], idx[:, 3]],
114
+ gaussian.reshape(-1))
115
+
116
+ heatmaps = (heatmaps * max_bound).reshape(-1, H, W)
117
+
118
+ return heatmaps, keypoint_weights
119
+
120
+
121
+ def generate_gaussian_heatmaps(
122
+ heatmap_size: Tuple[int, int],
123
+ keypoints: np.ndarray,
124
+ keypoints_visible: np.ndarray,
125
+ sigma: Union[float, Tuple[float], np.ndarray],
126
+ ) -> Tuple[np.ndarray, np.ndarray]:
127
+ """Generate gaussian heatmaps of keypoints.
128
+
129
+ Args:
130
+ heatmap_size (Tuple[int, int]): Heatmap size in [W, H]
131
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
132
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
133
+ (N, K)
134
+ sigma (float or List[float]): A list of sigma values of the Gaussian
135
+ heatmap for each instance. If sigma is given as a single float
136
+ value, it will be expanded into a tuple
137
+
138
+ Returns:
139
+ tuple:
140
+ - heatmaps (np.ndarray): The generated heatmap in shape
141
+ (K, H, W) where [W, H] is the `heatmap_size`
142
+ - keypoint_weights (np.ndarray): The target weights in shape
143
+ (N, K)
144
+ """
145
+
146
+ N, K, _ = keypoints.shape
147
+ W, H = heatmap_size
148
+
149
+ heatmaps = np.zeros((K, H, W), dtype=np.float32)
150
+ keypoint_weights = keypoints_visible.copy()
151
+
152
+ if isinstance(sigma, (int, float)):
153
+ sigma = (sigma, ) * N
154
+
155
+ for n in range(N):
156
+ # 3-sigma rule
157
+ radius = sigma[n] * 3
158
+
159
+ # xy grid
160
+ gaussian_size = 2 * radius + 1
161
+ x = np.arange(0, gaussian_size, 1, dtype=np.float32)
162
+ y = x[:, None]
163
+ x0 = y0 = gaussian_size // 2
164
+
165
+ for k in range(K):
166
+ # skip unlabeled keypoints
167
+ if keypoints_visible[n, k] < 0.5:
168
+ continue
169
+
170
+ # get gaussian center coordinates
171
+ mu = (keypoints[n, k] + 0.5).astype(np.int64)
172
+
173
+ # check that the gaussian has in-bounds part
174
+ left, top = (mu - radius).astype(np.int64)
175
+ right, bottom = (mu + radius + 1).astype(np.int64)
176
+
177
+ if left >= W or top >= H or right < 0 or bottom < 0:
178
+ keypoint_weights[n, k] = 0
179
+ continue
180
+
181
+ # The gaussian is not normalized,
182
+ # we want the center value to equal 1
183
+ gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma[n]**2))
184
+
185
+ # valid range in gaussian
186
+ g_x1 = max(0, -left)
187
+ g_x2 = min(W, right) - left
188
+ g_y1 = max(0, -top)
189
+ g_y2 = min(H, bottom) - top
190
+
191
+ # valid range in heatmap
192
+ h_x1 = max(0, left)
193
+ h_x2 = min(W, right)
194
+ h_y1 = max(0, top)
195
+ h_y2 = min(H, bottom)
196
+
197
+ heatmap_region = heatmaps[k, h_y1:h_y2, h_x1:h_x2]
198
+ gaussian_region = gaussian[g_y1:g_y2, g_x1:g_x2]
199
+
200
+ _ = np.maximum(
201
+ heatmap_region, gaussian_region, out=heatmap_region)
202
+
203
+ return heatmaps, keypoint_weights
204
+
205
+
206
+ def generate_unbiased_gaussian_heatmaps(
207
+ heatmap_size: Tuple[int, int],
208
+ keypoints: np.ndarray,
209
+ keypoints_visible: np.ndarray,
210
+ sigma: float,
211
+ ) -> Tuple[np.ndarray, np.ndarray]:
212
+ """Generate gaussian heatmaps of keypoints using `Dark Pose`_.
213
+
214
+ Args:
215
+ heatmap_size (Tuple[int, int]): Heatmap size in [W, H]
216
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
217
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
218
+ (N, K)
219
+
220
+ Returns:
221
+ tuple:
222
+ - heatmaps (np.ndarray): The generated heatmap in shape
223
+ (K, H, W) where [W, H] is the `heatmap_size`
224
+ - keypoint_weights (np.ndarray): The target weights in shape
225
+ (N, K)
226
+
227
+ .. _`Dark Pose`: https://arxiv.org/abs/1910.06278
228
+ """
229
+
230
+ N, K, _ = keypoints.shape
231
+ W, H = heatmap_size
232
+
233
+ heatmaps = np.zeros((K, H, W), dtype=np.float32)
234
+ keypoint_weights = keypoints_visible.copy()
235
+
236
+ # 3-sigma rule
237
+ radius = sigma * 3
238
+
239
+ # xy grid
240
+ x = np.arange(0, W, 1, dtype=np.float32)
241
+ y = np.arange(0, H, 1, dtype=np.float32)[:, None]
242
+
243
+ for n, k in product(range(N), range(K)):
244
+ # skip unlabeled keypoints
245
+ if keypoints_visible[n, k] < 0.5:
246
+ continue
247
+
248
+ mu = keypoints[n, k]
249
+ # check that the gaussian has in-bounds part
250
+ left, top = mu - radius
251
+ right, bottom = mu + radius + 1
252
+
253
+ if left >= W or top >= H or right < 0 or bottom < 0:
254
+ keypoint_weights[n, k] = 0
255
+ continue
256
+
257
+ gaussian = np.exp(-((x - mu[0])**2 + (y - mu[1])**2) / (2 * sigma**2))
258
+
259
+ _ = np.maximum(gaussian, heatmaps[k], out=heatmaps[k])
260
+
261
+ return heatmaps, keypoint_weights
262
+
263
+
264
+ def generate_udp_gaussian_heatmaps(
265
+ heatmap_size: Tuple[int, int],
266
+ keypoints: np.ndarray,
267
+ keypoints_visible: np.ndarray,
268
+ sigma,
269
+ keypoints_visibility: np.ndarray,
270
+ increase_sigma_with_padding: bool = False,
271
+ ) -> Tuple[np.ndarray, np.ndarray]:
272
+ """Generate gaussian heatmaps of keypoints using `UDP`_.
273
+
274
+ Args:
275
+ heatmap_size (Tuple[int, int]): Heatmap size in [W, H]
276
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
277
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
278
+ (N, K)
279
+ sigma (float): The sigma value of the Gaussian heatmap
280
+ keypoints_visibility (np.ndarray): The visibility bit for each keypoint (N, K)
281
+ increase_sigma_with_padding (bool): Whether to increase the sigma
282
+ value with padding. Default: False
283
+
284
+ Returns:
285
+ tuple:
286
+ - heatmaps (np.ndarray): The generated heatmap in shape
287
+ (K, H, W) where [W, H] is the `heatmap_size`
288
+ - keypoint_weights (np.ndarray): The target weights in shape
289
+ (N, K)
290
+
291
+ .. _`UDP`: https://arxiv.org/abs/1911.07524
292
+ """
293
+
294
+ N, K, _ = keypoints.shape
295
+ W, H = heatmap_size
296
+
297
+ heatmaps = np.zeros((K, H, W), dtype=np.float32)
298
+ keypoint_weights = keypoints_visible.copy()
299
+
300
+ if isinstance(sigma, (int, float)):
301
+ scaled_sigmas = sigma * np.ones((N, K), dtype=np.float32)
302
+ sigmas = np.array([sigma] * K).reshape(1, -1).repeat(N, axis=0)
303
+ else:
304
+ scaled_sigmas = np.array(sigma).reshape(1, -1).repeat(N, axis=0)
305
+ sigmas = np.array(sigma).reshape(1, -1).repeat(N, axis=0)
306
+
307
+ scales_arr = np.ones((N, K), dtype=np.float32)
308
+ if increase_sigma_with_padding:
309
+ diag = np.sqrt(W**2 + H**2)
310
+ for n in range(N):
311
+ image_kpts = keypoints[n, :].squeeze()
312
+ vis_kpts = image_kpts[keypoints_visibility[n, :] > 0.5]
313
+
314
+ # Compute the distance between img_kpts and visible_kpts
315
+ if vis_kpts.size == 0:
316
+ min_dists = np.ones(image_kpts.shape[0]) * diag
317
+ else:
318
+ dists = cdist(image_kpts, vis_kpts, metric='euclidean')
319
+ min_dists = np.min(dists, axis=1)
320
+
321
+ scales = min_dists / diag * 2.0 # Maximum distance (the diagonal) gives scale 2.0, i.e. sigma is scaled up to 3.0*sigma below
322
+ scales_arr[n, :] = scales
323
+ scaled_sigmas[n, :] = sigma * (1+scales)
324
+
325
+ # print(scales_arr)
326
+ # print(scaled_sigmas)
327
+
328
+ for n, k in product(range(N), range(K)):
329
+ scaled_sigma = scaled_sigmas[n, k]
330
+ # skip unlabeled keypoints
331
+ if keypoints_visible[n, k] < 0.5:
332
+ continue
333
+
334
+ # 3-sigma rule
335
+ radius = scaled_sigma * 3
336
+
337
+ # xy grid
338
+ gaussian_size = 2 * radius + 1
339
+ x = np.arange(0, gaussian_size, 1, dtype=np.float32)
340
+ y = x[:, None]
341
+
342
+ mu = (keypoints[n, k] + 0.5).astype(np.int64)
343
+ # check that the gaussian has in-bounds part
344
+ left, top = (mu - radius).round().astype(np.int64)
345
+ right, bottom = (mu + radius + 1).round().astype(np.int64)
346
+ # left, top = (mu - radius).astype(np.int64)
347
+ # right, bottom = (mu + radius + 1).astype(np.int64)
348
+
349
+ if left >= W or top >= H or right < 0 or bottom < 0:
350
+ keypoint_weights[n, k] = 0
351
+ continue
352
+
353
+ mu_ac = keypoints[n, k]
354
+ x0 = y0 = gaussian_size // 2
355
+ x0 += mu_ac[0] - mu[0]
356
+ y0 += mu_ac[1] - mu[1]
357
+ gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * scaled_sigma**2))
358
+
359
+ # Normalize Gaussian such that scaled_sigma = sigma is the norm
360
+ gaussian = gaussian / (scaled_sigma / sigmas[n, k])
361
+
362
+ # valid range in gaussian
363
+ g_x1 = max(0, -left)
364
+ g_x2 = min(W, right) - left
365
+ g_y1 = max(0, -top)
366
+ g_y2 = min(H, bottom) - top
367
+
368
+ # valid range in heatmap
369
+ h_x1 = max(0, left)
370
+ h_x2 = min(W, right)
371
+ h_y1 = max(0, top)
372
+ h_y2 = min(H, bottom)
373
+
374
+ # breakpoint()
375
+
376
+ heatmap_region = heatmaps[k, h_y1:h_y2, h_x1:h_x2]
377
+ gaussian_region = gaussian[g_y1:g_y2, g_x1:g_x2]
378
+
379
+ _ = np.maximum(heatmap_region, gaussian_region, out=heatmap_region)
380
+
381
+ return heatmaps, keypoint_weights
382
+
383
+
384
+ def generate_onehot_heatmaps(
385
+ heatmap_size: Tuple[int, int],
386
+ keypoints: np.ndarray,
387
+ keypoints_visible: np.ndarray,
388
+ sigma,
389
+ keypoints_visibility: np.ndarray,
390
+ increase_sigma_with_padding: bool = False,
391
+ ) -> Tuple[np.ndarray, np.ndarray]:
392
+ """Generate gaussian heatmaps of keypoints using `UDP`_.
393
+
394
+ Args:
395
+ heatmap_size (Tuple[int, int]): Heatmap size in [W, H]
396
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
397
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
398
+ (N, K)
399
+ sigma (float): Unused here; kept for interface compatibility with the Gaussian generators
400
+ keypoints_visibility (np.ndarray): The visibility bit for each keypoint (N, K)
401
+ increase_sigma_with_padding (bool): Whether to increase the sigma
402
+ value with padding. Default: False
403
+
404
+ Returns:
405
+ tuple:
406
+ - heatmaps (np.ndarray): The generated heatmap in shape
407
+ (K, H, W) where [W, H] is the `heatmap_size`
408
+ - keypoint_weights (np.ndarray): The target weights in shape
409
+ (N, K)
410
+
411
+ .. _`UDP`: https://arxiv.org/abs/1911.07524
412
+ """
413
+
414
+ N, K, _ = keypoints.shape
415
+ W, H = heatmap_size
416
+
417
+ heatmaps = np.zeros((K, H, W), dtype=np.float32)
418
+ keypoint_weights = keypoints_visible.copy()
419
+
420
+ for n, k in product(range(N), range(K)):
421
+ # skip unlabeled keypoints
422
+ if keypoints_visible[n, k] < 0.5:
423
+ continue
424
+
425
+ mu = (keypoints[n, k] + 0.5).astype(np.int64)
426
+
427
+
428
+ if mu[0] < 0 or mu[0] >= W or mu[1] < 0 or mu[1] >= H:
429
+ keypoint_weights[n, k] = 0
430
+ continue
431
+
432
+ heatmaps[k, mu[1], mu[0]] = 1
433
+ return heatmaps, keypoint_weights
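A minimal usage sketch for generate_gaussian_heatmaps, assuming the exports from mmpose/codecs/utils/__init__.py above; sizes and coordinates are illustrative.

import numpy as np
from mmpose.codecs.utils import generate_gaussian_heatmaps

keypoints = np.array([[[12.0, 20.0], [40.0, 30.0]]], dtype=np.float32)  # (N=1, K=2, 2)
keypoints_visible = np.ones((1, 2), dtype=np.float32)                   # both keypoints labelled
heatmaps, weights = generate_gaussian_heatmaps(
    heatmap_size=(64, 48), keypoints=keypoints,
    keypoints_visible=keypoints_visible, sigma=2.0)
assert heatmaps.shape == (2, 48, 64) and weights.shape == (1, 2)
# each channel peaks at 1.0 at the rounded keypoint location, e.g. heatmaps[0, 20, 12] == 1.0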
mmpose/codecs/utils/instance_property.py ADDED
@@ -0,0 +1,111 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from typing import Optional
3
+
4
+ import numpy as np
5
+
6
+
7
+ def get_instance_root(keypoints: np.ndarray,
8
+ keypoints_visible: Optional[np.ndarray] = None,
9
+ root_type: str = 'kpt_center') -> np.ndarray:
10
+ """Calculate the coordinates and visibility of instance roots.
11
+
12
+ Args:
13
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
14
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
15
+ (N, K)
16
+ root_type (str): Calculation of instance roots which should
17
+ be one of the following options:
18
+
19
+ - ``'kpt_center'``: The roots' coordinates are the mean
20
+ coordinates of visible keypoints
21
+ - ``'bbox_center'``: The roots' are the center of bounding
22
+ boxes outlined by visible keypoints
23
+
24
+ Defaults to ``'kpt_center'``
25
+
26
+ Returns:
27
+ tuple
28
+ - roots_coordinate(np.ndarray): Coordinates of instance roots in
29
+ shape [N, D]
30
+ - roots_visible(np.ndarray): Visibility of instance roots in
31
+ shape [N]
32
+ """
33
+
34
+ roots_coordinate = np.zeros((keypoints.shape[0], 2), dtype=np.float32)
35
+ roots_visible = np.ones((keypoints.shape[0]), dtype=np.float32) * 2
36
+
37
+ for i in range(keypoints.shape[0]):
38
+
39
+ # collect visible keypoints
40
+ if keypoints_visible is not None:
41
+ visible_keypoints = keypoints[i][keypoints_visible[i] > 0]
42
+ else:
43
+ visible_keypoints = keypoints[i]
44
+ if visible_keypoints.size == 0:
45
+ roots_visible[i] = 0
46
+ continue
47
+
48
+ # compute the instance root with visible keypoints
49
+ if root_type == 'kpt_center':
50
+ roots_coordinate[i] = visible_keypoints.mean(axis=0)
51
+ roots_visible[i] = 1
52
+ elif root_type == 'bbox_center':
53
+ roots_coordinate[i] = (visible_keypoints.max(axis=0) +
54
+ visible_keypoints.min(axis=0)) / 2.0
55
+ roots_visible[i] = 1
56
+ else:
57
+ raise ValueError(
58
+ f'the value of `root_type` must be \'kpt_center\' or '
59
+ f'\'bbox_center\', but got \'{root_type}\'')
60
+
61
+ return roots_coordinate, roots_visible
62
+
63
+
64
+ def get_instance_bbox(keypoints: np.ndarray,
65
+ keypoints_visible: Optional[np.ndarray] = None
66
+ ) -> np.ndarray:
67
+ """Calculate the pseudo instance bounding box from visible keypoints. The
68
+ bounding boxes are in the xyxy format.
69
+
70
+ Args:
71
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
72
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
73
+ (N, K)
74
+
75
+ Returns:
76
+ np.ndarray: bounding boxes in [N, 4]
77
+ """
78
+ bbox = np.zeros((keypoints.shape[0], 4), dtype=np.float32)
79
+ for i in range(keypoints.shape[0]):
80
+ if keypoints_visible is not None:
81
+ visible_keypoints = keypoints[i][keypoints_visible[i] > 0]
82
+ else:
83
+ visible_keypoints = keypoints[i]
84
+ if visible_keypoints.size == 0:
85
+ continue
86
+
87
+ bbox[i, :2] = visible_keypoints.min(axis=0)
88
+ bbox[i, 2:] = visible_keypoints.max(axis=0)
89
+ return bbox
90
+
91
+
92
+ def get_diagonal_lengths(keypoints: np.ndarray,
93
+ keypoints_visible: Optional[np.ndarray] = None
94
+ ) -> np.ndarray:
95
+ """Calculate the diagonal length of instance bounding box from visible
96
+ keypoints.
97
+
98
+ Args:
99
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
100
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
101
+ (N, K)
102
+
103
+ Returns:
104
+ np.ndarray: bounding box diagonal length in [N]
105
+ """
106
+ pseudo_bbox = get_instance_bbox(keypoints, keypoints_visible)
107
+ pseudo_bbox = pseudo_bbox.reshape(-1, 2, 2)
108
+ h_w_diff = pseudo_bbox[:, 1] - pseudo_bbox[:, 0]
109
+ diagonal_length = np.sqrt(np.power(h_w_diff, 2).sum(axis=1))
110
+
111
+ return diagonal_length
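A small usage sketch for the instance-property helpers above (import path per the __init__.py of this commit; values illustrative).

import numpy as np
from mmpose.codecs.utils import (get_diagonal_lengths, get_instance_bbox, get_instance_root)

keypoints = np.array([[[0.0, 0.0], [4.0, 0.0], [0.0, 3.0]]])   # (N=1, K=3, 2)
visible = np.ones((1, 3), dtype=np.float32)
roots, roots_visible = get_instance_root(keypoints, visible, root_type='bbox_center')
# roots -> [[2., 1.5]], the centre of the 4x3 pseudo bounding box
bbox = get_instance_bbox(keypoints, visible)                   # -> [[0., 0., 4., 3.]]
diagonal = get_diagonal_lengths(keypoints, visible)            # -> [5.]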
mmpose/codecs/utils/offset_heatmap.py ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from itertools import product
3
+ from typing import Tuple
4
+
5
+ import numpy as np
6
+
7
+
8
+ def generate_offset_heatmap(
9
+ heatmap_size: Tuple[int, int],
10
+ keypoints: np.ndarray,
11
+ keypoints_visible: np.ndarray,
12
+ radius_factor: float,
13
+ ) -> Tuple[np.ndarray, np.ndarray]:
14
+ """Generate offset heatmaps of keypoints, where each keypoint is
15
+ represented by 3 maps: one pixel-level class label map (1 for keypoint and
16
+ 0 for non-keypoint) and 2 pixel-level offset maps for x and y directions
17
+ respectively.
18
+
19
+ Args:
20
+ heatmap_size (Tuple[int, int]): Heatmap size in [W, H]
21
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
22
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
23
+ (N, K)
24
+ radius_factor (float): The radius factor of the binary label
25
+ map. The positive region is defined as the neighbor of the
26
+ keypoint with the radius :math:`r=radius_factor*max(W, H)`
27
+
28
+ Returns:
29
+ tuple:
30
+ - heatmap (np.ndarray): The generated heatmap in shape
31
+ (K*3, H, W) where [W, H] is the `heatmap_size`
32
+ - keypoint_weights (np.ndarray): The target weights in shape
33
+ (K,)
34
+ """
35
+
36
+ N, K, _ = keypoints.shape
37
+ W, H = heatmap_size
38
+
39
+ heatmaps = np.zeros((K, 3, H, W), dtype=np.float32)
40
+ keypoint_weights = keypoints_visible.copy()
41
+
42
+ # xy grid
43
+ x = np.arange(0, W, 1)
44
+ y = np.arange(0, H, 1)[:, None]
45
+
46
+ # positive area radius in the classification map
47
+ radius = radius_factor * max(W, H)
48
+
49
+ for n, k in product(range(N), range(K)):
50
+ if keypoints_visible[n, k] < 0.5:
51
+ continue
52
+
53
+ mu = keypoints[n, k]
54
+
55
+ x_offset = (mu[0] - x) / radius
56
+ y_offset = (mu[1] - y) / radius
57
+
58
+ heatmaps[k, 0] = np.where(x_offset**2 + y_offset**2 <= 1, 1., 0.)
59
+ heatmaps[k, 1] = x_offset
60
+ heatmaps[k, 2] = y_offset
61
+
62
+ heatmaps = heatmaps.reshape(K * 3, H, W)
63
+
64
+ return heatmaps, keypoint_weights
65
+
66
+
67
+ def generate_displacement_heatmap(
68
+ heatmap_size: Tuple[int, int],
69
+ keypoints: np.ndarray,
70
+ keypoints_visible: np.ndarray,
71
+ roots: np.ndarray,
72
+ roots_visible: np.ndarray,
73
+ diagonal_lengths: np.ndarray,
74
+ radius: float,
75
+ ):
76
+ """Generate displacement heatmaps of keypoints, where each keypoint is
77
+ represented by 3 maps: one pixel-level class label map (1 for keypoint and
78
+ 0 for non-keypoint) and 2 pixel-level offset maps for x and y directions
79
+ respectively.
80
+
81
+ Args:
82
+ heatmap_size (Tuple[int, int]): Heatmap size in [W, H]
83
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
84
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
85
+ (N, K)
86
+ roots (np.ndarray): Coordinates of instance centers in shape (N, D).
87
+ The displacement fields of each instance will locate around its
88
+ center.
89
+ roots_visible (np.ndarray): Roots visibilities in shape (N,)
90
+ diagonal_lengths (np.ndarray): Diagonal length of the bounding boxes
91
+ of each instance in shape (N,)
92
+ radius (float): Radius (in pixels) of the neighborhood around each
93
+ instance root within which the displacement fields and their weights
94
+ are generated.
95
+
96
+ Returns:
97
+ tuple:
98
+ - displacements (np.ndarray): The generated displacement map in
99
+ shape (K*2, H, W) where [W, H] is the `heatmap_size`
100
+ - displacement_weights (np.ndarray): The target weights in shape
101
+ (K*2, H, W)
102
+ """
103
+ N, K, _ = keypoints.shape
104
+ W, H = heatmap_size
105
+
106
+ displacements = np.zeros((K * 2, H, W), dtype=np.float32)
107
+ displacement_weights = np.zeros((K * 2, H, W), dtype=np.float32)
108
+ instance_size_map = np.zeros((H, W), dtype=np.float32)
109
+
110
+ for n in range(N):
111
+ if (roots_visible[n] < 1 or (roots[n, 0] < 0 or roots[n, 1] < 0)
112
+ or (roots[n, 0] >= W or roots[n, 1] >= H)):
113
+ continue
114
+
115
+ diagonal_length = diagonal_lengths[n]
116
+
117
+ for k in range(K):
118
+ if keypoints_visible[n, k] < 1 or keypoints[n, k, 0] < 0 \
119
+ or keypoints[n, k, 1] < 0 or keypoints[n, k, 0] >= W \
120
+ or keypoints[n, k, 1] >= H:
121
+ continue
122
+
123
+ start_x = max(int(roots[n, 0] - radius), 0)
124
+ start_y = max(int(roots[n, 1] - radius), 0)
125
+ end_x = min(int(roots[n, 0] + radius), W)
126
+ end_y = min(int(roots[n, 1] + radius), H)
127
+
128
+ for x in range(start_x, end_x):
129
+ for y in range(start_y, end_y):
130
+ if displacements[2 * k, y,
131
+ x] != 0 or displacements[2 * k + 1, y,
132
+ x] != 0:
133
+ if diagonal_length > instance_size_map[y, x]:
134
+ # keep the gt displacement of smaller instance
135
+ continue
136
+
137
+ displacement_weights[2 * k:2 * k + 2, y,
138
+ x] = 1 / diagonal_length
139
+ displacements[2 * k:2 * k + 2, y,
140
+ x] = keypoints[n, k] - [x, y]
141
+ instance_size_map[y, x] = diagonal_length
142
+
143
+ return displacements, displacement_weights
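A minimal usage sketch for generate_offset_heatmap (illustrative sizes; imports assume the __init__.py above).

import numpy as np
from mmpose.codecs.utils import generate_offset_heatmap

keypoints = np.array([[[8.0, 6.0]]])                 # (N=1, K=1, 2)
visible = np.ones((1, 1), dtype=np.float32)
heatmaps, weights = generate_offset_heatmap(
    heatmap_size=(16, 12), keypoints=keypoints,
    keypoints_visible=visible, radius_factor=0.25)
assert heatmaps.shape == (3, 12, 16)                 # [cls, x_offset, y_offset] for the single keypoint
assert heatmaps[0, 6, 8] == 1.0                      # the keypoint pixel lies in the positive region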
mmpose/codecs/utils/oks_map.py ADDED
@@ -0,0 +1,97 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from itertools import product
3
+ from typing import Optional, Tuple, Union
4
+
5
+ import numpy as np
6
+ from scipy.spatial.distance import cdist
7
+
8
+
9
+ def generate_oks_maps(
10
+ heatmap_size: Tuple[int, int],
11
+ keypoints: np.ndarray,
12
+ keypoints_visible: np.ndarray,
13
+ keypoints_visibility: np.ndarray,
14
+ sigma: float = 0.55,
15
+ increase_sigma_with_padding: bool = False,
16
+ ) -> Tuple[np.ndarray, np.ndarray]:
17
+ """Generate gaussian heatmaps of keypoints using `UDP`_.
18
+
19
+ Args:
20
+ heatmap_size (Tuple[int, int]): Heatmap size in [W, H]
21
+ keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
22
+ keypoints_visible (np.ndarray): Keypoint visibilities in shape
23
+ (N, K)
24
+ sigma (float): The sigma value of the Gaussian heatmap
25
+ keypoints_visibility (np.ndarray): The visibility bit for each keypoint (N, K)
26
+ increase_sigma_with_padding (bool): Whether to increase the sigma
27
+ value with padding. Default: False
28
+
29
+ Returns:
30
+ tuple:
31
+ - heatmaps (np.ndarray): The generated heatmap in shape
32
+ (K, H, W) where [W, H] is the `heatmap_size`
33
+ - keypoint_weights (np.ndarray): The target weights in shape
34
+ (N, K)
35
+
36
+ .. _`UDP`: https://arxiv.org/abs/1911.07524
37
+ """
38
+
39
+ N, K, _ = keypoints.shape
40
+ W, H = heatmap_size
41
+
42
+ # The default sigmas are used for COCO dataset.
43
+ sigmas = np.array(
44
+ [2.6, 2.5, 2.5, 3.5, 3.5, 7.9, 7.9, 7.2, 7.2, 6.2, 6.2, 10.7, 10.7, 8.7, 8.7, 8.9, 8.9])/100
45
+ # sigmas = sigmas * 2 / sigmas.mean()
46
+ # sigmas = np.round(sigmas).astype(int)
47
+ # sigmas = np.clip(sigmas, 1, 10)
48
+
49
+ heatmaps = np.zeros((K, H, W), dtype=np.float32)
50
+ keypoint_weights = keypoints_visible.copy()
51
+
52
+ # bbox_area = W/1.25 * H/1.25
53
+ # bbox_area = W * H * 0.53
54
+ bbox_area = np.sqrt(H/1.25 * W/1.25)
55
+
56
+ # print(scales_arr)
57
+ # print(scaled_sigmas)
58
+
59
+ for n, k in product(range(N), range(K)):
60
+ kpt_sigma = sigmas[k]
61
+ # skip unlabeled keypoints
62
+ if keypoints_visible[n, k] < 0.5:
63
+ continue
64
+
65
+ y_idx, x_idx = np.indices((H, W))
66
+ dx = x_idx - keypoints[n, k, 0]
67
+ dy = y_idx - keypoints[n, k, 1]
68
+ dist = np.sqrt(dx**2 + dy**2)
69
+
70
+ # e_map = (dx**2 + dy**2) / ((kpt_sigma*100)**2 * sigma)
71
+ vars = (kpt_sigma*2)**2
72
+ s = vars * bbox_area * 2
73
+ s = np.clip(s, 0.55, 3.0)
74
+ if sigma is not None and sigma > 0:
75
+ s = sigma
76
+ e_map = dist**2 / (2*s)
77
+ oks_map = np.exp(-e_map)
78
+
79
+ keypoint_weights[n, k] = (oks_map.max() > 0).astype(int)
80
+
81
+ # Scale such that there is always 1 at the maximum
82
+ if oks_map.max() > 1e-3:
83
+ oks_map = oks_map / oks_map.max()
84
+
85
+ # Scale OKS map such that 1 stays 1 and 0.5 becomes 0
86
+ # oks_map[oks_map < 0.5] = 0
87
+ # oks_map = 2 * oks_map - 1
88
+
89
+
90
+ # oks_map[oks_map > 0.95] = 1
91
+ # print("{:.4f}, {:7.1f}, {:9.3f}, {:9.3f}, {:4.2f}".format(vars, bbox_area, vars * bbox_area* 2, s, oks_map.max()))
92
+ # if np.all(oks_map < 0.1):
93
+ # print("\t{:d} --> {:.4f}".format(k, s))
94
+ heatmaps[k] = oks_map
95
+ # breakpoint()
96
+
97
+ return heatmaps, keypoint_weights
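A hedged usage sketch for generate_oks_maps; note that the COCO per-keypoint sigmas are hard-coded inside, so K must not exceed 17. Values are illustrative.

import numpy as np
from mmpose.codecs.utils import generate_oks_maps

keypoints = np.array([[[24.0, 18.0], [40.0, 30.0]]])   # (N=1, K=2, 2)
visible = np.ones((1, 2), dtype=np.float32)
oks_maps, weights = generate_oks_maps(
    heatmap_size=(64, 48), keypoints=keypoints,
    keypoints_visible=visible, keypoints_visibility=visible, sigma=0.55)
assert oks_maps.shape == (2, 48, 64)
assert np.isclose(oks_maps[0, 18, 24], 1.0)            # each map is rescaled to peak at 1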
mmpose/codecs/utils/post_processing.py ADDED
@@ -0,0 +1,530 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from itertools import product
3
+ from typing import Tuple
4
+
5
+ import cv2
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch import Tensor
10
+
11
+ from scipy.signal import convolve2d
12
+
13
+
14
+ def get_simcc_normalized(batch_pred_simcc, sigma=None):
15
+ """Normalize the predicted SimCC.
16
+
17
+ Args:
18
+ batch_pred_simcc (torch.Tensor): The predicted SimCC.
19
+ sigma (float): The sigma of the Gaussian distribution.
20
+
21
+ Returns:
22
+ torch.Tensor: The normalized SimCC.
23
+ """
24
+ B, K, _ = batch_pred_simcc.shape
25
+
26
+ # Scale and clamp the tensor
27
+ if sigma is not None:
28
+ batch_pred_simcc = batch_pred_simcc / (sigma * np.sqrt(np.pi * 2))
29
+ batch_pred_simcc = batch_pred_simcc.clamp(min=0)
30
+
31
+ # Compute the binary mask
32
+ mask = (batch_pred_simcc.amax(dim=-1) > 1).reshape(B, K, 1)
33
+
34
+ # Normalize the tensor using the maximum value
35
+ norm = (batch_pred_simcc / batch_pred_simcc.amax(dim=-1).reshape(B, K, 1))
36
+
37
+ # Apply normalization
38
+ batch_pred_simcc = torch.where(mask, norm, batch_pred_simcc)
39
+
40
+ return batch_pred_simcc
41
+
42
+
43
+ def get_simcc_maximum(simcc_x: np.ndarray,
44
+ simcc_y: np.ndarray,
45
+ apply_softmax: bool = False
46
+ ) -> Tuple[np.ndarray, np.ndarray]:
47
+ """Get maximum response location and value from simcc representations.
48
+
49
+ Note:
50
+ instance number: N
51
+ num_keypoints: K
52
+ heatmap height: H
53
+ heatmap width: W
54
+
55
+ Args:
56
+ simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx)
57
+ simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy)
58
+ apply_softmax (bool): whether to apply softmax on the heatmap.
59
+ Defaults to False.
60
+
61
+ Returns:
62
+ tuple:
63
+ - locs (np.ndarray): locations of maximum heatmap responses in shape
64
+ (K, 2) or (N, K, 2)
65
+ - vals (np.ndarray): values of maximum heatmap responses in shape
66
+ (K,) or (N, K)
67
+ """
68
+
69
+ assert isinstance(simcc_x, np.ndarray), ('simcc_x should be numpy.ndarray')
70
+ assert isinstance(simcc_y, np.ndarray), ('simcc_y should be numpy.ndarray')
71
+ assert simcc_x.ndim == 2 or simcc_x.ndim == 3, (
72
+ f'Invalid shape {simcc_x.shape}')
73
+ assert simcc_y.ndim == 2 or simcc_y.ndim == 3, (
74
+ f'Invalid shape {simcc_y.shape}')
75
+ assert simcc_x.ndim == simcc_y.ndim, (
76
+ f'{simcc_x.shape} != {simcc_y.shape}')
77
+
78
+ if simcc_x.ndim == 3:
79
+ N, K, Wx = simcc_x.shape
80
+ simcc_x = simcc_x.reshape(N * K, -1)
81
+ simcc_y = simcc_y.reshape(N * K, -1)
82
+ else:
83
+ N = None
84
+
85
+ if apply_softmax:
86
+ simcc_x = simcc_x - np.max(simcc_x, axis=1, keepdims=True)
87
+ simcc_y = simcc_y - np.max(simcc_y, axis=1, keepdims=True)
88
+ ex, ey = np.exp(simcc_x), np.exp(simcc_y)
89
+ simcc_x = ex / np.sum(ex, axis=1, keepdims=True)
90
+ simcc_y = ey / np.sum(ey, axis=1, keepdims=True)
91
+
92
+ x_locs = np.argmax(simcc_x, axis=1)
93
+ y_locs = np.argmax(simcc_y, axis=1)
94
+ locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
95
+ max_val_x = np.amax(simcc_x, axis=1)
96
+ max_val_y = np.amax(simcc_y, axis=1)
97
+
98
+ mask = max_val_x > max_val_y
99
+ max_val_x[mask] = max_val_y[mask]
100
+ vals = max_val_x
101
+ locs[vals <= 0.] = -1
102
+
103
+ if N:
104
+ locs = locs.reshape(N, K, 2)
105
+ vals = vals.reshape(N, K)
106
+
107
+ return locs, vals
108
+
109
+
110
+ def get_heatmap_3d_maximum(heatmaps: np.ndarray
111
+ ) -> Tuple[np.ndarray, np.ndarray]:
112
+ """Get maximum response location and value from heatmaps.
113
+
114
+ Note:
115
+ batch_size: B
116
+ num_keypoints: K
117
+ heatmap dimension: D
118
+ heatmap height: H
119
+ heatmap width: W
120
+
121
+ Args:
122
+ heatmaps (np.ndarray): Heatmaps in shape (K, D, H, W) or
123
+ (B, K, D, H, W)
124
+
125
+ Returns:
126
+ tuple:
127
+ - locs (np.ndarray): locations of maximum heatmap responses in shape
128
+ (K, 3) or (B, K, 3)
129
+ - vals (np.ndarray): values of maximum heatmap responses in shape
130
+ (K,) or (B, K)
131
+ """
132
+ assert isinstance(heatmaps,
133
+ np.ndarray), ('heatmaps should be numpy.ndarray')
134
+ assert heatmaps.ndim == 4 or heatmaps.ndim == 5, (
135
+ f'Invalid shape {heatmaps.shape}')
136
+
137
+ if heatmaps.ndim == 4:
138
+ K, D, H, W = heatmaps.shape
139
+ B = None
140
+ heatmaps_flatten = heatmaps.reshape(K, -1)
141
+ else:
142
+ B, K, D, H, W = heatmaps.shape
143
+ heatmaps_flatten = heatmaps.reshape(B * K, -1)
144
+
145
+ z_locs, y_locs, x_locs = np.unravel_index(
146
+ np.argmax(heatmaps_flatten, axis=1), shape=(D, H, W))
147
+ locs = np.stack((x_locs, y_locs, z_locs), axis=-1).astype(np.float32)
148
+ vals = np.amax(heatmaps_flatten, axis=1)
149
+ locs[vals <= 0.] = -1
150
+
151
+ if B:
152
+ locs = locs.reshape(B, K, 3)
153
+ vals = vals.reshape(B, K)
154
+
155
+ return locs, vals
156
+
157
+
158
+ def get_heatmap_maximum(heatmaps: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
159
+ """Get maximum response location and value from heatmaps.
160
+
161
+ Note:
162
+ batch_size: B
163
+ num_keypoints: K
164
+ heatmap height: H
165
+ heatmap width: W
166
+
167
+ Args:
168
+ heatmaps (np.ndarray): Heatmaps in shape (K, H, W) or (B, K, H, W)
169
+
170
+ Returns:
171
+ tuple:
172
+ - locs (np.ndarray): locations of maximum heatmap responses in shape
173
+ (K, 2) or (B, K, 2)
174
+ - vals (np.ndarray): values of maximum heatmap responses in shape
175
+ (K,) or (B, K)
176
+ """
177
+ assert isinstance(heatmaps,
178
+ np.ndarray), ('heatmaps should be numpy.ndarray')
179
+ assert heatmaps.ndim == 3 or heatmaps.ndim == 4, (
180
+ f'Invalid shape {heatmaps.shape}')
181
+
182
+ if heatmaps.ndim == 3:
183
+ K, H, W = heatmaps.shape
184
+ B = None
185
+ heatmaps_flatten = heatmaps.reshape(K, -1)
186
+ else:
187
+ B, K, H, W = heatmaps.shape
188
+ heatmaps_flatten = heatmaps.reshape(B * K, -1)
189
+
190
+ y_locs, x_locs = np.unravel_index(
191
+ np.argmax(heatmaps_flatten, axis=1), shape=(H, W))
192
+ locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
193
+ vals = np.amax(heatmaps_flatten, axis=1)
194
+ locs[vals <= 0.] = -1
195
+
196
+ if B:
197
+ locs = locs.reshape(B, K, 2)
198
+ vals = vals.reshape(B, K)
199
+
200
+ return locs, vals
201
+
202
+
203
+ def gaussian_blur(heatmaps: np.ndarray, kernel: int = 11) -> np.ndarray:
204
+ """Modulate heatmap distribution with Gaussian.
205
+
206
+ Note:
207
+ - num_keypoints: K
208
+ - heatmap height: H
209
+ - heatmap width: W
210
+
211
+ Args:
212
+ heatmaps (np.ndarray[K, H, W]): model predicted heatmaps.
213
+ kernel (int): Gaussian kernel size (K) for modulation, which should
214
+ match the heatmap gaussian sigma when training.
215
+ K=17 for sigma=3 and k=11 for sigma=2.
216
+
217
+ Returns:
218
+ np.ndarray ([K, H, W]): Modulated heatmap distribution.
219
+ """
220
+ assert kernel % 2 == 1
221
+
222
+ border = (kernel - 1) // 2
223
+ K, H, W = heatmaps.shape
224
+
225
+ for k in range(K):
226
+ origin_max = np.max(heatmaps[k])
227
+ dr = np.zeros((H + 2 * border, W + 2 * border), dtype=np.float32)
228
+ dr[border:-border, border:-border] = heatmaps[k].copy()
229
+ dr = cv2.GaussianBlur(dr, (kernel, kernel), 0)
230
+ heatmaps[k] = dr[border:-border, border:-border].copy()
231
+ heatmaps[k] *= origin_max / (np.max(heatmaps[k])+1e-12)
232
+ return heatmaps
233
+
234
+
235
+ def gaussian_blur1d(simcc: np.ndarray, kernel: int = 11) -> np.ndarray:
236
+ """Modulate simcc distribution with Gaussian.
237
+
238
+ Note:
239
+ - num_keypoints: K
240
+ - simcc length: Wx
241
+
242
+ Args:
243
+ simcc (np.ndarray[K, Wx]): model predicted simcc.
244
+ kernel (int): Gaussian kernel size (K) for modulation, which should
245
+ match the simcc gaussian sigma when training.
246
+ K=17 for sigma=3 and k=11 for sigma=2.
247
+
248
+ Returns:
249
+ np.ndarray ([K, Wx]): Modulated simcc distribution.
250
+ """
251
+ assert kernel % 2 == 1
252
+
253
+ border = (kernel - 1) // 2
254
+ N, K, Wx = simcc.shape
255
+
256
+ for n, k in product(range(N), range(K)):
257
+ origin_max = np.max(simcc[n, k])
258
+ dr = np.zeros((1, Wx + 2 * border), dtype=np.float32)
259
+ dr[0, border:-border] = simcc[n, k].copy()
260
+ dr = cv2.GaussianBlur(dr, (kernel, 1), 0)
261
+ simcc[n, k] = dr[0, border:-border].copy()
262
+ simcc[n, k] *= origin_max / np.max(simcc[n, k])
263
+ return simcc
264
+
265
+
266
+ def batch_heatmap_nms(batch_heatmaps: Tensor, kernel_size: int = 5):
267
+ """Apply NMS on a batch of heatmaps.
268
+
269
+ Args:
270
+ batch_heatmaps (Tensor): batch heatmaps in shape (B, K, H, W)
271
+ kernel_size (int): The kernel size of the NMS which should be
272
+ a odd integer. Defaults to 5
273
+
274
+ Returns:
275
+ Tensor: The batch heatmaps after NMS.
276
+ """
277
+
278
+ assert isinstance(kernel_size, int) and kernel_size % 2 == 1, \
279
+ f'The kernel_size should be an odd integer, got {kernel_size}'
280
+
281
+ padding = (kernel_size - 1) // 2
282
+
283
+ maximum = F.max_pool2d(
284
+ batch_heatmaps, kernel_size, stride=1, padding=padding)
285
+ maximum_indicator = torch.eq(batch_heatmaps, maximum)
286
+ batch_heatmaps = batch_heatmaps * maximum_indicator.float()
287
+
288
+ return batch_heatmaps
289
+
290
+
291
+ def get_heatmap_expected_value(heatmaps: np.ndarray, parzen_size: float = 0.1, return_heatmap: bool = False) -> Tuple[np.ndarray, np.ndarray]:
292
+ """Get maximum response location and value from heatmaps.
293
+
294
+ Note:
295
+ batch_size: B
296
+ num_keypoints: K
297
+ heatmap height: H
298
+ heatmap width: W
299
+
300
+ Args:
301
+ heatmaps (np.ndarray): Heatmaps in shape (K, H, W) or (B, K, H, W)
302
+
303
+ Returns:
304
+ tuple:
305
+ - locs (np.ndarray): locations of maximum heatmap responses in shape
306
+ (K, 2) or (B, K, 2)
307
+ - vals (np.ndarray): values of maximum heatmap responses in shape
308
+ (K,) or (B, K)
309
+ """
310
+ assert isinstance(heatmaps,
311
+ np.ndarray), ('heatmaps should be numpy.ndarray')
312
+ assert heatmaps.ndim == 3 or heatmaps.ndim == 4, (
313
+ f'Invalid shape {heatmaps.shape}')
314
+
315
+ assert parzen_size >= 0.0 and parzen_size <= 1.0, (
316
+ f'Invalid parzen_size {parzen_size}')
317
+
318
+ if heatmaps.ndim == 3:
319
+ K, H, W = heatmaps.shape
320
+ B = 1
321
+ FIRST_DIM = K
322
+ heatmaps_flatten = heatmaps.reshape(1, K, H, W)
323
+ else:
324
+ B, K, H, W = heatmaps.shape
325
+ FIRST_DIM = K*B
326
+ heatmaps_flatten = heatmaps.reshape(B, K, H, W)
327
+
328
+ # Blur heatmaps with Gaussian
329
+ # heatmaps_flatten = gaussian_blur(heatmaps_flatten, kernel=9)
330
+
331
+ # Zero out pixels far from the maximum for each heatmap
332
+ # heatmaps_tmp = heatmaps_flatten.copy().reshape(B*K, H*W)
333
+ # y_locs, x_locs = np.unravel_index(
334
+ # np.argmax(heatmaps_tmp, axis=1), shape=(H, W))
335
+ # locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
336
+ # heatmaps_flatten = heatmaps_flatten.reshape(B*K, H, W)
337
+ # for i, x in enumerate(x_locs):
338
+ # y = y_locs[i]
339
+ # start_x = int(max(0, x - 0.2*W))
340
+ # end_x = int(min(W, x + 0.2*W))
341
+ # start_y = int(max(0, y - 0.2*H))
342
+ # end_y = int(min(H, y + 0.2*H))
343
+ # mask = np.zeros((H, W))
344
+ # mask[start_y:end_y, start_x:end_x] = 1
345
+ # heatmaps_flatten[i] = heatmaps_flatten[i] * mask
346
+ # heatmaps_flatten = heatmaps_flatten.reshape(B, K, H, W)
347
+
348
+
349
+ bbox_area = np.sqrt(H/1.25 * W/1.25)
350
+
351
+ kpt_sigmas = np.array(
352
+ [2.6, 2.5, 2.5, 3.5, 3.5, 7.9, 7.9, 7.2, 7.2, 6.2, 6.2, 10.7, 10.7, 8.7, 8.7, 8.9, 8.9])/100
353
+
354
+ heatmaps_covolved = np.zeros_like(heatmaps_flatten)
355
+ for k in range(K):
356
+ vars = (kpt_sigmas[k]*2)**2
357
+ s = vars * bbox_area * 2
358
+ s = np.clip(s, 0.55, 3.0)
359
+ radius = np.ceil(s * 3).astype(int)
360
+ diameter = 2*radius + 1
361
+ diameter = np.ceil(diameter).astype(int)
362
+ # kernel_sizes[kernel_sizes % 2 == 0] += 1
363
+ center = diameter // 2
364
+ dist_x = np.arange(diameter) - center
365
+ dist_y = np.arange(diameter) - center
366
+ dist_x, dist_y = np.meshgrid(dist_x, dist_y)
367
+ dist = np.sqrt(dist_x**2 + dist_y**2)
368
+ oks_kernel = np.exp(-dist**2 / (2 * s))
369
+ oks_kernel = oks_kernel / oks_kernel.sum()
370
+
371
+ htm = heatmaps_flatten[:, k, :, :].reshape(-1, H, W)
372
+ # htm = np.pad(htm, ((0, 0), (radius, radius), (radius, radius)), mode='symmetric')
373
+ # htm = torch.from_numpy(htm).float()
374
+ # oks_kernel = torch.from_numpy(oks_kernel).float().to(htm.device).reshape(1, diameter, diameter)
375
+ oks_kernel = oks_kernel.reshape(1, diameter, diameter)
376
+ htm_conv = np.zeros_like(htm)
377
+ for b in range(B):
378
+ htm_conv[b, :, :] = convolve2d(htm[b, :, :], oks_kernel[b, :, :], mode='same', boundary='symm')
379
+ # htm_conv = F.conv2d(htm.unsqueeze(1), oks_kernel.unsqueeze(1), padding='same')
380
+ # htm_conv = htm_conv[:, :, radius:-radius, radius:-radius]
381
+ htm_conv = htm_conv.reshape(-1, 1, H, W)
382
+ heatmaps_covolved[:, k, :, :] = htm_conv
383
+
384
+
385
+ heatmaps_covolved = heatmaps_covolved.reshape(B*K, H*W)
386
+ y_locs, x_locs = np.unravel_index(
387
+ np.argmax(heatmaps_covolved, axis=1), shape=(H, W))
388
+ locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
389
+
390
+ # Apply mean-shift to get sub-pixel locations
391
+ locs = _get_subpixel_maximums(heatmaps_covolved.reshape(B*K, H, W), locs)
392
+ # breakpoint()
393
+
394
+
395
+ # heatmaps_sums = heatmaps_flatten.sum(axis=(1, 2))
396
+ # norm_heatmaps = heatmaps_flatten.copy()
397
+ # norm_heatmaps[heatmaps_sums > 0] = heatmaps_flatten[heatmaps_sums > 0] / heatmaps_sums[heatmaps_sums > 0, None, None]
398
+
399
+
400
+ # # Compute Parzen window with Gaussian blur along the edge instead of simple mirroring
401
+ # x_pad = int(parzen_size * W + 0.5)
402
+ # y_pad = int(parzen_size * H + 0.5)
403
+ # # x_pad = 0
404
+ # # y_pad = 0
405
+ # kernel_size = int(min(H, W)*parzen_size + 0.5)
406
+ # if kernel_size % 2 == 0:
407
+ # kernel_size += 1
408
+ # # norm_heatmaps_pad_blur = np.pad(norm_heatmaps, ((0, 0), (x_pad, x_pad), (y_pad, y_pad)), mode='symmetric')
409
+ # norm_heatmaps_pad = np.pad(norm_heatmaps, ((0, 0), (y_pad, y_pad), (x_pad, x_pad)), mode='constant', constant_values=0)
410
+ # norm_heatmaps_pad_blur = gaussian_blur(norm_heatmaps_pad, kernel=kernel_size)
411
+
412
+ # # norm_heatmaps_pad_blur[:, x_pad:-x_pad, y_pad:-y_pad] = norm_heatmaps
413
+
414
+ # norm_heatmaps_pad_sum = norm_heatmaps_pad_blur.sum(axis=(1, 2))
415
+ # norm_heatmaps_pad_blur[norm_heatmaps_pad_sum>0] = norm_heatmaps_pad_blur[norm_heatmaps_pad_sum>0] / norm_heatmaps_pad_sum[norm_heatmaps_pad_sum>0, None, None]
416
+
417
+ # # # Save the blurred heatmaps
418
+ # # for i in range(heatmaps.shape[0]):
419
+ # # tmp_htm = norm_heatmaps_pad_blur[i].copy()
420
+ # # tmp_htm = (tmp_htm - tmp_htm.min()) / (tmp_htm.max() - tmp_htm.min())
421
+ # # tmp_htm = (tmp_htm*255).astype(np.uint8)
422
+ # # tmp_htm = cv2.cvtColor(tmp_htm, cv2.COLOR_GRAY2BGR)
423
+ # # tmp_htm = cv2.applyColorMap(tmp_htm, cv2.COLORMAP_JET)
424
+
425
+ # # tmp_htm2 = norm_heatmaps_pad[i].copy()
426
+ # # tmp_htm2 = (tmp_htm2 - tmp_htm2.min()) / (tmp_htm2.max() - tmp_htm2.min())
427
+ # # tmp_htm2 = (tmp_htm2*255).astype(np.uint8)
428
+ # # tmp_htm2 = cv2.cvtColor(tmp_htm2, cv2.COLOR_GRAY2BGR)
429
+ # # tmp_htm2 = cv2.applyColorMap(tmp_htm2, cv2.COLORMAP_JET)
430
+
431
+ # # tmp_htm = cv2.addWeighted(tmp_htm, 0.5, tmp_htm2, 0.5, 0)
432
+
433
+ # # cv2.imwrite(f'heatmaps_blurred_{i}.png', tmp_htm)
434
+
435
+ # # norm_heatmaps_pad = np.pad(norm_heatmaps, ((0, 0), (x_pad, x_pad), (y_pad, y_pad)), mode='edge')
436
+
437
+ # y_idx, x_idx = np.indices(norm_heatmaps_pad_blur.shape[1:])
438
+
439
+ # # breakpoint()
440
+ # x_locs = np.sum(norm_heatmaps_pad_blur * x_idx, axis=(1, 2)) - x_pad
441
+ # y_locs = np.sum(norm_heatmaps_pad_blur * y_idx, axis=(1, 2)) - y_pad
442
+
443
+ # # mean_idx = np.argmax(heatmaps_flatten, axis=1)
444
+ # # x_locs, y_locs = np.unravel_index(mean_idx, shape=(H, W))
445
+ # # locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
446
+ # # breakpoint()
447
+ # # vals = heatmaps_flatten[np.arange(heatmaps_flatten.shape[0]), mean_idx]
448
+ # # locs[vals <= 0.] = -1
449
+
450
+ # # mean_idx = np.argmax(norm_heatmaps, axis=1)
451
+ # # y_locs, x_locs = np.unravel_index(
452
+ # # mean_idx, shape=(H, W))
453
+
454
+ # locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
455
+ # # vals = np.amax(heatmaps_flatten, axis=1)
456
+
457
+
458
+ x_locs_int = np.round(x_locs).astype(int)
459
+ x_locs_int = np.clip(x_locs_int, 0, W-1)
460
+ y_locs_int = np.round(y_locs).astype(int)
461
+ y_locs_int = np.clip(y_locs_int, 0, H-1)
462
+ vals = heatmaps_flatten[np.arange(B), np.arange(K), y_locs_int, x_locs_int]
463
+ # breakpoint()
464
+ # locs[vals <= 0.] = -1
465
+
466
+ # print(mean_idx)
467
+ # print(x_locs)
468
+ # print(y_locs)
469
+ # print(locs)
470
+ heatmaps_covolved = heatmaps_covolved.reshape(B, K, H, W)
471
+
472
+ if B > 1:
473
+ locs = locs.reshape(B, K, 2)
474
+ vals = vals.reshape(B, K)
475
+ heatmaps_covolved = heatmaps_covolved.reshape(B, K, H, W)
476
+ else:
477
+ locs = locs.reshape(K, 2)
478
+ vals = vals.reshape(K)
479
+ heatmaps_covolved = heatmaps_covolved.reshape(K, H, W)
480
+
481
+ if return_heatmap:
482
+ return locs, vals, heatmaps_covolved
483
+ else:
484
+ return locs, vals
485
+
486
+
487
+
488
+ def _get_subpixel_maximums(heatmaps, locs):
489
+ # Extract integer peak locations
490
+ x_locs = locs[:, 0].astype(np.int32)
491
+ y_locs = locs[:, 1].astype(np.int32)
492
+
493
+ # Ensure we are not near the boundaries (avoid boundary issues)
494
+ valid_mask = (x_locs > 0) & (x_locs < heatmaps.shape[2] - 1) & \
495
+ (y_locs > 0) & (y_locs < heatmaps.shape[1] - 1)
496
+
497
+ # Initialize the output array with the integer locations
498
+ subpixel_locs = locs.copy()
499
+
500
+ if np.any(valid_mask):
501
+ # Extract valid locations
502
+ x_locs_valid = x_locs[valid_mask]
503
+ y_locs_valid = y_locs[valid_mask]
504
+
505
+ # Compute gradients (dx, dy) and second derivatives (dxx, dyy)
506
+ dx = (heatmaps[valid_mask, y_locs_valid, x_locs_valid + 1] -
507
+ heatmaps[valid_mask, y_locs_valid, x_locs_valid - 1]) / 2.0
508
+ dy = (heatmaps[valid_mask, y_locs_valid + 1, x_locs_valid] -
509
+ heatmaps[valid_mask, y_locs_valid - 1, x_locs_valid]) / 2.0
510
+ dxx = heatmaps[valid_mask, y_locs_valid, x_locs_valid + 1] + \
511
+ heatmaps[valid_mask, y_locs_valid, x_locs_valid - 1] - \
512
+ 2 * heatmaps[valid_mask, y_locs_valid, x_locs_valid]
513
+ dyy = heatmaps[valid_mask, y_locs_valid + 1, x_locs_valid] + \
514
+ heatmaps[valid_mask, y_locs_valid - 1, x_locs_valid] - \
515
+ 2 * heatmaps[valid_mask, y_locs_valid, x_locs_valid]
516
+
517
+ # Avoid division by zero by setting a minimum threshold for the second derivatives
518
+ dxx = np.where(dxx != 0, dxx, 1e-6)
519
+ dyy = np.where(dyy != 0, dyy, 1e-6)
520
+
521
+ # Calculate the sub-pixel shift
522
+ subpixel_x_shift = -dx / dxx
523
+ subpixel_y_shift = -dy / dyy
524
+
525
+ # Update subpixel locations for valid indices
526
+ subpixel_locs[valid_mask, 0] += subpixel_x_shift
527
+ subpixel_locs[valid_mask, 1] += subpixel_y_shift
528
+
529
+ return subpixel_locs
530
+
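Finally, a small usage sketch for the peak decoding helper above (illustrative values).

import numpy as np
from mmpose.codecs.utils import get_heatmap_maximum

heatmaps = np.zeros((2, 48, 64), dtype=np.float32)
heatmaps[0, 20, 12] = 0.9                  # a single response for keypoint 0
locs, vals = get_heatmap_maximum(heatmaps)
# locs -> [[12., 20.], [-1., -1.]]         (all-zero channels are flagged with -1)
# vals -> [0.9, 0.]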