diff --git a/.gitmodules b/.gitmodules index 773fbae..847caa9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,4 +6,4 @@ url = https://github.com/adishavit/argh [submodule "lib/kProcessor"] path = lib/kProcessor - url = https://github.com/dib-lab/kProcessor.git + url = git@github.com:dib-lab/kProcessor.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 8793d2e..fb7638d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,8 +11,8 @@ project ( VERSION 2.0.0 ) -set(default_build_type "Release") -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -lstdc++fs -fPIC -lgomp -lrt -fopenmp -O3 -Ofast") +set(default_build_type "RELEASE") +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -lstdc++fs -fPIC -lgomp -lrt -fopenmp -Ofast") # ---------------------------------------------------------------------------- # kProcessor Setup # ---------------------------------------------------------------------------- @@ -31,13 +31,13 @@ include_directories(${kProcessor_INCLUDE_PATH}) include_directories("${PROJECT_SOURCE_DIR}/include") include_directories("${PROJECT_SOURCE_DIR}/lib/argh") -include_directories("${PROJECT_SOURCE_DIR}/lib/json_parser") +include_directories("${PROJECT_SOURCE_DIR}/lib/json_parser/lib/include/") add_subdirectory("lib/zstr") set(PHMAP_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/lib/kProcessor/ThirdParty/kmerDecoder/lib/parallel-hashmap") include_directories("${PHMAP_INCLUDE_DIRS}") -add_library(kSpider STATIC src/pairwise.cpp src/index.cpp src/fastx_to_kf.cpp src/sourmash_indexing.cpp ${PROJECT_SOURCE_DIR}/lib/kProcessor/include/kProcessor) +add_library(kSpider STATIC src/pairwise.cpp src/index.cpp src/fastx_to_kf.cpp src/sourmash_indexing.cpp src/bins_indexing.cpp ${PROJECT_SOURCE_DIR}/lib/kProcessor/include/kProcessor) set_target_properties(kSpider PROPERTIES POSITION_INDEPENDENT_CODE 1 CXX_STANDARD 17) target_link_libraries (kSpider kProcessor z) target_link_libraries (kSpider kProcessor z zstr::zstr) @@ -49,6 +49,30 @@ 
target_include_directories(kSpider INTERFACE ${PHMAP_INCLUDE_DIRS} ${PROJECT_SOU add_executable(pairwise pairwise.cpp) target_link_libraries(pairwise kSpider kProcessor z) +add_executable(index_bins bins.cpp) +target_link_libraries(index_bins kSpider kProcessor z) + +add_executable(validate validate.cpp) +target_link_libraries(validate kSpider kProcessor z) + +add_executable(dump_bin export_bin.cpp) +target_link_libraries(dump_bin kSpider kProcessor z) + +add_executable(check_bin check_bin.cpp) +target_link_libraries(check_bin kSpider kProcessor z) + +add_executable(dump_sig export_sig.cpp) +target_link_libraries(dump_sig kSpider kProcessor z) + +add_executable(sigs_to_bins sigs_to_bins.cpp) +target_link_libraries(sigs_to_bins kSpider kProcessor z) + +add_executable(sig_to_bin sig_to_bin.cpp) +target_link_libraries(sig_to_bin kSpider kProcessor z zstr::zstr) + +add_executable(repr_sketches apps/repr_sketches.cpp) +target_link_libraries(repr_sketches z kProcessor) + # add_executable(index_kframes apps/index_kframes.cpp) # target_link_libraries(index_kframes kSpider kProcessor z) diff --git a/apps/repr_sketches.cpp b/apps/repr_sketches.cpp new file mode 100644 index 0000000..2e048ab --- /dev/null +++ b/apps/repr_sketches.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace boost::algorithm; +using namespace std; + +bool comp(pair a, pair b) { + return a.second > b.second; +} + + +int main(int argc, char** argv) { + ifstream fin(argv[1]); + phmap::flat_hash_map count; + string line; + getline(fin, line); // skip header. 
// Headers needed by to_uint64_t; kept beside the function so it is self-contained.
#include <charconv>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <system_error>

/**
 * @brief Parses a non-negative base-10 integer from a string.
 *
 * The whole string must consist of decimal digits: no sign, no whitespace,
 * no trailing characters. Unlike the previous hand-rolled loop, malformed
 * input and values that do not fit in 64 bits are reported instead of
 * silently producing a garbage or wrapped-around number.
 *
 * @param value Text to parse, e.g. a command-line argument.
 * @return The parsed value.
 * @throws std::invalid_argument if `value` is empty or contains a non-digit.
 * @throws std::out_of_range if the number exceeds the range of uint64_t.
 */
inline uint64_t to_uint64_t(std::string const& value) {
    uint64_t result = 0;
    char const* const first = value.data();
    char const* const last = first + value.size();

    // from_chars is locale-independent and, for unsigned types, accepts digits only.
    auto const [ptr, ec] = std::from_chars(first, last, result, 10);

    if (ec == std::errc::result_out_of_range) {
        throw std::out_of_range("to_uint64_t: value out of range: '" + value + "'");
    }
    // ptr != last means parsing stopped early on a non-digit character.
    if (ec != std::errc() || ptr != last) {
        throw std::invalid_argument("to_uint64_t: not an unsigned integer: '" + value + "'");
    }
    return result;
}
No newline at end of file diff --git a/export_bin.cpp b/export_bin.cpp new file mode 100644 index 0000000..cb5ab08 --- /dev/null +++ b/export_bin.cpp @@ -0,0 +1,33 @@ +#include +#include +#include +#include "parallel_hashmap/phmap.h" +#include +#include +#include +#include +#include +#include "parallel_hashmap/phmap_dump.h" +#include + +using namespace std; +// using namespace phmap; + + +int main(int argc, char** argv) { + + if (argc != 2) { + cout << "run: ./dump_bin " << endl; + exit(1); + } + + string bin_path = argv[1]; + + phmap::flat_hash_set table_in; + phmap::BinaryInputArchive ar_in(bin_path.c_str()); + table_in.phmap_load(ar_in); + cerr << "loaded bin size: " << table_in.size() << endl; + + for(const uint64_t & hash : table_in) cout << hash << endl; + +} \ No newline at end of file diff --git a/export_sig.cpp b/export_sig.cpp new file mode 100644 index 0000000..65517a7 --- /dev/null +++ b/export_sig.cpp @@ -0,0 +1,55 @@ +#include +#include +#include +#include "parallel_hashmap/phmap.h" +#include +#include +#include "cpp-json/json.h" +#include "zstr.hpp" +#include +#include +#include +#include "parallel_hashmap/phmap_dump.h" +#include + +using namespace std; +// using namespace phmap; + +typedef std::chrono::high_resolution_clock Time; + + +int main(int argc, char** argv) { + + if (argc != 3) { + cout << "run: ./dump_sig " << endl; + exit(1); + } + + string sig_path = argv[1]; + int kSize = stoi(argv[2]); + + phmap::flat_hash_set tmp_hashes; + + auto begin_time = Time::now(); + zstr::ifstream sig_stream(sig_path); + json::value json = json::parse(sig_stream); + auto sourmash_sig = json[0]["signatures"]; + const json::array& sig_array = as_array(sourmash_sig); + for (auto it = sig_array.begin(); it != sig_array.end(); ++it) { + const json::value& v = *it; + if (v["ksize"] == kSize) { + const json::array& mins = as_array(v["mins"]); + auto mins_it = mins.begin(); + while (mins_it != mins.end()) { + tmp_hashes.insert(json::to_number(*mins_it)); + mins_it++; 
+ } + } + break; + } + + + for (const uint64_t& hash : tmp_hashes) cout << hash << endl; + + +} \ No newline at end of file diff --git a/include/kSpider.hpp b/include/kSpider.hpp index 416cdcd..8572009 100644 --- a/include/kSpider.hpp +++ b/include/kSpider.hpp @@ -16,6 +16,7 @@ namespace kSpider { void index_dayhoff(int kSize, string fasta_file, string names_file, int chunk_size, string index_prefix); void index_datasets(string kfs_dir); void sourmash_sigs_indexing(string sigs_dir, int kSize); + void bins_indexing(string bins_dir, int selective_kSize, string output_prefix, uint64_t kmers_reserve, uint64_t colors_reserve); void paired_end_to_kDataFrame(string r1_file_name, string r2_file_name, int kSize, int chunk_size, int downsampling_ratio, bool remove_singletones); void single_end_to_kDataFrame(string r1_file_name, int kSize, int chunk_size, int downsampling_ration, bool remove_singletones); void protein_to_kDataFrame(string r1_file_name, int kSize, int chunk_size, bool is_dayhoff, string output_prefix, int downsampling_ration = 1); diff --git a/lib/json_parser/CMakeLists.txt b/lib/json_parser/CMakeLists.txt new file mode 100644 index 0000000..2e2c8d2 --- /dev/null +++ b/lib/json_parser/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required (VERSION 3.0) +project(cpp-json CXX) + +add_subdirectory(lib) +add_subdirectory(test) diff --git a/lib/json_parser/COPYING b/lib/json_parser/COPYING new file mode 100644 index 0000000..8cdb845 --- /dev/null +++ b/lib/json_parser/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + {description} + Copyright (C) {year} {fullname} + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + {signature of Ty Coon}, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. + diff --git a/lib/json_parser/LICENSE b/lib/json_parser/LICENSE new file mode 100644 index 0000000..d39b556 --- /dev/null +++ b/lib/json_parser/LICENSE @@ -0,0 +1,17 @@ +cpp-json +Copyright (C) 2014-2015 Evan Teran + evan.teran@gmail.com + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
diff --git a/lib/json_parser/README.md b/lib/json_parser/README.md new file mode 100644 index 0000000..e7e9485 --- /dev/null +++ b/lib/json_parser/README.md @@ -0,0 +1,88 @@ +**cpp-json is licensed under the GNU General Public License, version 2 or later.** + +**NOTE:** version [2.2](https://github.com/eteran/cpp-json/releases/tag/2.2) will be the last to not require C++11. + +There are a few different JSON parsing libraries out there. But cpp-json aims to be the simplest to use while still being efficient by using modern C++ techniques. Additionally, this library is header-only, making it trivial to include in existing projects. + +Currently, the only active item on the TODO list is to better support Unicode. The parser can be given iterators referring to wide characters, but there is no runtime detection or endian-ness detection (yet). + +However, Unicode is generally **well supported** in the form of `\uXXXX` encoding, including code points which require surrogate pairs. The resulting `json::value` object will contain the string, encoded as UTF-8 since it is stored in a `std::string`. + +Of course special consideration is needed when displaying these strings if they do in fact contain non-ASCII characters. + +So, for example, + +```json +{"test1" : "\uD840\uDC8A"} +``` + +will correctly parse and the object's "test1" member will have the byte sequence: `0xF0 0xA0 0x82 0x8A` + +Here is a simple example of the usage of this library: + +```c++ +#include "cpp-json/json.h" +#include <fstream> +#include <iostream> + +int main() { + // open a file + std::ifstream file("test.json"); + + // json::parse can take two iterators or a std::istream + json::value json = json::parse(file); + + // you can access objects like associative arrays easily + // the result is a json::value + // ... 
though in real code you may want to check the type first ;-) + auto servlets = json["web-app"]["servlet"]; + + // when dealing with arrays, you can just use iterators, + // or feel free to use C++11 ranged-for + const json::array &a = as_array(servlets); + for(auto it = a.begin(); it != a.end(); ++it) { + const json::value &v = *it; + // all basic types (numbers, strings, booleans) can be converted + // to a string + std::cout << to_string(v["servlet-name"]) << std::endl; + } +} +``` + +You can also programmatically create `json::value` objects like this: + +```c++ +int main(int argc, char *argv[]) { + auto arr = json::array { + 1, + 2, + 3, + 4, + "Testing 1 2 3", + json::object{ + { "hello", 1234 }, + { "world", 5678 } + } + }; + + std::cout << stringify(arr) << std::endl; +} +``` + +Which of course results in a object representing the following JSON: + +```json +[ + 1, + 2, + 3, + 4, + "Testing 1 2 3", + { + "hello" : 1234, + "world" : 5678 + } +] +``` + +Finally, this library is very fast, when processing a 190 MB JSON file I randomly selected, parsing took no more than 18 seconds on my machine. For a Qt4 JSON parsing library, you can also checkout my other project: [QJson4](https://github.com/eteran/qjson4) diff --git a/lib/json_parser/RSJparser.tcc b/lib/json_parser/RSJparser.tcc deleted file mode 100644 index 41f6833..0000000 --- a/lib/json_parser/RSJparser.tcc +++ /dev/null @@ -1,791 +0,0 @@ -/** ************************************************************************************** -* * -* A Ridiculously Simple JSON Parser for C++ (RSJp-cpp) * -* Version 2.x * -* ---------------------------------------------------------- * -* Copyright (C) 2018 Subhrajit Bhattacharya * -* * -* This program is free software: you can redistribute it and/or modify * -* it under the terms of the GNU General Public License as published by * -* the Free Software Foundation, either version 3 of the License, or * -* (at your option) any later version. 
* -* * -* This program is distributed in the hope that it will be useful, * -* but WITHOUT ANY WARRANTY; without even the implied warranty of * -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * -* GNU General Public License for more details . * -* * -* * -* Contact: subhrajit@gmail.com * -* https://www.lehigh.edu/~sub216/ , http://subhrajit.net/ * -* * -* * -*************************************************************************************** **/ - -#ifndef __DOSL_RSJPARSE_TCC -#define __DOSL_RSJPARSE_TCC - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "zstr.hpp" - -static char const* RSJobjectbrackets = "{}"; -static char const* RSJarraybrackets = "[]"; -static char RSJobjectassignment = ':'; -static char RSJarraydelimiter = ','; - -static std::vector RSJbrackets = {RSJobjectbrackets, RSJarraybrackets}; -static std::vector RSJstringquotes = {"\"\"", "''"}; -static char RSJcharescape = '\\'; -static std::string RSJlinecommentstart = "//"; - -static std::string RSJprinttab = " "; - -enum RSJresourceType { RSJ_UNINITIATED, RSJ_UNKNOWN, RSJ_OBJECT, RSJ_ARRAY, RSJ_LEAF }; - -// ============================================================ -// Direct string manipulation functions - -inline -std::string to_string (RSJresourceType rt) { - switch (rt) { - case RSJ_UNINITIATED: return("RSJ_UNINITIATED"); - case RSJ_UNKNOWN: return("RSJ_UNKNOWN"); - case RSJ_OBJECT: return("RSJ_OBJECT"); - case RSJ_ARRAY: return("RSJ_ARRAY"); - case RSJ_LEAF: return("RSJ_LEAF"); - } -} - -enum StrTrimDir { STRTRIM_L=1, STRTRIM_R=2, STRTRIM_LR=3 }; - -inline -std::string strtrim (std::string str, std::string chars=" \t\n\r", int max_count=-1, StrTrimDir dirs=STRTRIM_LR) { - if (str.empty()) return(str); - if (max_count<0) max_count = str.length(); - - if (dirs & STRTRIM_L) { // left trim - int p; - for (p=0; p& bracks, int indx=0) { - for (int b=0; b split_RSJ_array (const std::string& str) { // TODO: Make efficient. 
This function is speed bottleneck. - // splits, while respecting brackets and escapes - std::vector ret; - - std::string current; - std::vector bracket_stack; - std::vector quote_stack; - bool escape_active = false; - int bi; - - for (int a=0; a 0) { // already inside string - if (str[a]==RSJcharescape) // an escape character - escape_active = !escape_active; - else if (!escape_active && str[a]==RSJstringquotes[quote_stack.back()][1] ) { // close quote - quote_stack.pop_back(); - escape_active = false; - } - else - escape_active = false; - - current.push_back (str[a]); - continue; // to * - } - - if (quote_stack.size()==0) { // check for start of string - if ((bi = is_bracket (str[a], RSJstringquotes)) >= 0) { - quote_stack.push_back (bi); - current.push_back (str[a]); - continue; // to * - } - } - - // ------------------------------------ - // checks for comments - - if (quote_stack.size()==0) { // comment cannot start inside string - - // single-line commenst - if (str.compare (a, RSJlinecommentstart.length(), RSJlinecommentstart) == 0) { - // ignore until end of line - int newline_pos = str.find ("\n", a); - if (newline_pos == std::string::npos) - newline_pos = str.find ("\r", a); - - if (newline_pos != std::string::npos) - a = newline_pos; // point to the newline character (a will be incremented) - else // the comment continues until EOF - a = str.length(); - continue; - } - } - - // ------------------------------------ - // checks for brackets - - if ( bracket_stack.size()>0 && str[a]==RSJbrackets[bracket_stack.back()][1] ) { // check for closing bracket - bracket_stack.pop_back(); - current.push_back (str[a]); - continue; - } - - if ((bi = is_bracket (str[a], RSJbrackets)) >= 0) { - bracket_stack.push_back (bi); - current.push_back (str[a]); - continue; // to * - } - - // ------------------------------------ - // otherwise - current.push_back (str[a]); - } - - if (current.length() > 0) - ret.push_back (current); - - return (ret); -} - -inline -std::string 
insert_tab_after_newlines (std::string str) { - for (int a=0; a RSJobject; -typedef std::vector RSJarray; - -// ------------------------------------ -// Main classes - -class RSJresource { -/* Use: RSJresource("RSJ_string_data").as()["keyName"].as()[2].as() - RSJresource("RSJ_string_data")["keyName"][2].as() */ -private: - // main data - std::string data; // can be object, vector or leaf data - bool _exists; // whether the RSJ resource exists. - - // parsed data - RSJparsedData* parsed_data_p; - -public: - // constructor - RSJresource () : _exists (false), parsed_data_p (NULL) { } // no data field. - - RSJresource (std::string str) : data (str), _exists (true), parsed_data_p (NULL) { } - RSJresource (const char* str) : RSJresource(std::string(str)) { } - - // other convertion - template - RSJresource (dataType d) : RSJresource(std::to_string(d)) { } - - // read from file and stream - RSJresource (std::istream& is) : _exists (true), parsed_data_p (NULL) { - data = std::string ( (std::istreambuf_iterator(is)), (std::istreambuf_iterator()) ); - } - RSJresource (std::ifstream& ifs) : _exists (true), parsed_data_p (NULL) { - std::istream& is = ifs; - data = std::string ( (std::istreambuf_iterator(is)), (std::istreambuf_iterator()) ); - } - - RSJresource (zstr::ifstream& ifs) : _exists (true), parsed_data_p (NULL) { - data = std::string((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); - } - - - // free allocated memory for parsed data - ~RSJresource(); - - // deep copy - RSJresource (const RSJresource& r); - RSJresource& operator= (const RSJresource& r); - - // ------------------------------------ - // parsers (old) - RSJresourceType parse (bool force=false); - void parse_full (bool force=false, int max_depth=INT_MAX, int* parse_count_for_verbose_p=NULL); // recursively parse the entire JSON text - // parser (new) - void fast_parse (std::string* str_p=NULL, bool copy_string=false, int max_depth=INT_MAX, int* parse_start_str_pos=NULL); // TODO: finish. 
- - RSJobject& as_object (bool force=false); - RSJarray& as_array (bool force=false); - - // ------------------------------------ - - // access raw data and other attributes - int size(void); - std::string& raw_data (void) { return (data); } - bool exists (void) { return (_exists); } - bool is_parsed (void) { return (parsed_data_p!=NULL); } - RSJresourceType type (void); - // emitter - std::string as_str (bool print_comments=false, bool update_data=true); - void print (bool print_comments=false, bool update_data=true) - { std::cout << as_str(print_comments,update_data) << std::endl; } - - // opertor[] - RSJresource& operator[] (std::string key); // object - RSJresource& operator[] (int indx); // array - - // ------------------------------------ - - // as - template - dataType as (const dataType& def = dataType()) { // specialized outside class declaration - if (!exists()) return (def); - return dataType (data); // default behavior for unknown types: invoke 'dataType(std::string)' - } - - // as_vector - template > // vectorType should have push_back method - vectorType as_vector (const vectorType& def = vectorType()); - - // as_map - template > // mapType should have operator[] defined - mapType as_map (const mapType& def = mapType()); -}; - -// ------------------------------------------------------------ - -class RSJparsedData { -public: - RSJobject object; - RSJarray array; - - RSJresourceType type; - RSJparsedData() : type(RSJ_UNKNOWN) {} - - // parser (single-level) - void parse (const std::string& data, RSJresourceType typ = RSJ_UNKNOWN) { - std::string content = strtrim(data); - - if (typ==RSJ_OBJECT || typ==RSJ_UNKNOWN) { - // parse as object: - content = strtrim (strtrim (content, "{", 1, STRTRIM_L ), "}", 1, STRTRIM_R ); - if (content.length() != data.length()) { // a valid object - std::vector nvPairs = split_RSJ_array (content); - for (int a=0; a 0) { - type = RSJ_OBJECT; - return; - } - } - } - - if (typ==RSJ_ARRAY || typ==RSJ_UNKNOWN) { - // parse as 
array - content = strtrim (strtrim (content, "[", 1, STRTRIM_L ), "]", 1, STRTRIM_R ); - if (content.length() != data.length()) { // a valid array - std::vector nvPairs = split_RSJ_array (content); - for (int a=0; a 0) { - type = RSJ_ARRAY; - return; - } - } - } - - if (typ==RSJ_UNKNOWN) - type = RSJ_LEAF; - } - - - // remove non-existing items inserted due to accessing - int cleanup(void) { - - if (type==RSJ_OBJECT) { - bool found = true; - while (found) { - found = false; - for (auto it=object.begin(); it!=object.end(); ++it) - if (!(it->second.exists())) { - object.erase(it); - found = true; - break; // break for loop since it is now invalid - } - } - return (object.size()); - } - - if (type==RSJ_ARRAY) { // erases only the non-existent elements at the tail - while (!(array[array.size()-1].exists())) - array.pop_back(); - return (array.size()); - } - - if (type==RSJ_LEAF) - return (1); - - return (0); - } - - // size - int size(void) { return (cleanup()); } -}; - - -// ------------------------------------------------------------ -// RSJresource member functions - -inline -RSJresource::~RSJresource (){ - if (parsed_data_p) delete parsed_data_p; -} - -inline -RSJresource::RSJresource (const RSJresource& r) { - data=r.data; - _exists = r._exists; - if(r.parsed_data_p) parsed_data_p = new RSJparsedData(*(r.parsed_data_p)); - else parsed_data_p = NULL; -} - -inline -RSJresource& RSJresource::operator= (const RSJresource& r) { - data=r.data; - _exists = r._exists; - if(r.parsed_data_p) parsed_data_p = new RSJparsedData(*(r.parsed_data_p)); - else parsed_data_p = NULL; - return *this; -} - -inline -int RSJresource::size (void) { - if (!exists()) return (0); - parse(); // parse if not parsed - return (parsed_data_p->size()); -} - -inline -RSJresourceType RSJresource::type (void) { - if (!exists()) return (RSJ_UNINITIATED); - parse(); // parse if not parsed - return (parsed_data_p->type); -} - -inline -std::string RSJresource::as_str (bool print_comments, bool 
update_data) { - if (exists()) { - std::string ret; - parse(); // parse if not parsed - parsed_data_p->cleanup(); - - if (parsed_data_p->type==RSJ_OBJECT) { - ret = "{\n"; - for (auto it=parsed_data_p->object.begin(); it!=parsed_data_p->object.end(); ++it) { - ret += RSJprinttab + "'" + it->first + "': " + insert_tab_after_newlines( it->second.as_str (print_comments, update_data) ); - if (std::next(it) != parsed_data_p->object.end()) ret += ","; - if (print_comments) - ret += " // " + to_string(it->second.type()); - ret += "\n"; - } - ret += "}"; - } - else if (parsed_data_p->type==RSJ_ARRAY) { - ret = "[\n"; - for (auto it=parsed_data_p->array.begin(); it!=parsed_data_p->array.end(); ++it) { - ret += RSJprinttab + insert_tab_after_newlines( it->as_str (print_comments, update_data) ); - if (std::next(it) != parsed_data_p->array.end()) ret += ","; - if (print_comments) - ret += " // " + to_string(it->type()); - ret += "\n"; - } - ret += "]"; - } - else // RSJ_LEAF or RSJ_UNKNOWN - ret = strtrim (data); - - if (update_data) data = ret; - return (ret); - } - else - return (""); -} - -// Parsers - -inline -RSJresourceType RSJresource::parse (bool force) { - if (!parsed_data_p) parsed_data_p = new RSJparsedData; - if (parsed_data_p->type==RSJ_UNKNOWN || force) parsed_data_p->parse (data, RSJ_UNKNOWN); - return (parsed_data_p->type); -} - -inline -void RSJresource::parse_full (bool force, int max_depth, int* parse_count_for_verbose_p) { // recursive parsing (slow) - if (max_depth==0) return; - if (!parsed_data_p) parsed_data_p = new RSJparsedData; - if (parsed_data_p->type==RSJ_UNKNOWN || force) parsed_data_p->parse (data, RSJ_UNKNOWN); - // verbose - if (parse_count_for_verbose_p) { - (*parse_count_for_verbose_p)++; - if ( (*parse_count_for_verbose_p) % 100 == 0) - std::cout << "parse_full: " << (*parse_count_for_verbose_p) << " calls." 
<< std::endl; - } - // recursive parse children if not already parsed - if (parsed_data_p->type==RSJ_OBJECT) - for (auto it=parsed_data_p->object.begin(); it!=parsed_data_p->object.end(); ++it) - it->second.parse_full (force, max_depth-1, parse_count_for_verbose_p); - else if (parsed_data_p->type==RSJ_ARRAY) - for (auto it=parsed_data_p->array.begin(); it!=parsed_data_p->array.end(); ++it) - it->parse_full (force, max_depth-1, parse_count_for_verbose_p); -} - -// ------------------------------------------------------------ -// ============================================================ -// FAST PARSER (Under construction. DO NOT use the following functions in your application.) - -inline -int seek_next (std::string* str_p, int start_pos, char character) { - -} - -inline -void RSJresource::fast_parse (std::string* str_p, bool copy_string, int max_depth, int* parse_start_str_pos) { - // TODO: UNDER CONSTRUCTION... - - if (!str_p) - str_p = &data; - std::string& str = *str_p; - - // splits, while respecting brackets and escapes - //std::vector ret; - - //std::string current; - std::vector bracket_stack; - std::vector quote_stack; - bool escape_active = false; - int bi; - - bool initial_whitespaces = true; - bool isroot = false; - - if (!parse_start_str_pos) { - parse_start_str_pos = new int; - *parse_start_str_pos = 0; - isroot = true; - } - - int a = *parse_start_str_pos; - - while (*parse_start_str_pos < str_p->length()) { // * - - // initial whitespace characters - if (initial_whitespaces) { - if (str[a] == ' ' || str[a] == '\n' || str[a] == '\r' || str[a] == '\t' ) { - ++a; - continue; - } - else { - if (str[a] == '{') // start of object - // ... 
TODO: seek_next ':' - - initial_whitespaces = false; - } - } - - - // delimiter - if ( bracket_stack.size()==0 && quote_stack.size()==0 && str[a]==RSJarraydelimiter ) { - //ret.push_back (current); - - //current.clear(); - bracket_stack.clear(); quote_stack.clear(); escape_active = false; - continue; // to * - } - - // ------------------------------------ - // checks for string - - if (quote_stack.size() > 0) { // already inside string - if (str[a]==RSJcharescape) // an escape character - escape_active = !escape_active; - else if (!escape_active && str[a]==RSJstringquotes[quote_stack.back()][1] ) { // close quote - quote_stack.pop_back(); - escape_active = false; - } - else - escape_active = false; - - //current.push_back (str[a]); - continue; // to * - } - - if (quote_stack.size()==0) { // check for start of string - if ((bi = is_bracket (str[a], RSJstringquotes)) >= 0) { - quote_stack.push_back (bi); - //current.push_back (str[a]); - continue; // to * - } - } - - // ------------------------------------ - // checks for comments - - if (quote_stack.size()==0) { // comment cannot start inside string - - // single-line commenst - if (str.compare (a, RSJlinecommentstart.length(), RSJlinecommentstart) == 0) { - // ignore until end of line - int newline_pos = str.find ("\n", a); - if (newline_pos == std::string::npos) - newline_pos = str.find ("\r", a); - - if (newline_pos != std::string::npos) - a = newline_pos; // point to the newline character (a will be incremented) - else // the comment continues until EOF - a = str.length(); - continue; - } - } - - // ------------------------------------ - // checks for brackets - - if ( bracket_stack.size()>0 && str[a]==RSJbrackets[bracket_stack.back()][1] ) { // check for closing bracket - bracket_stack.pop_back(); - //current.push_back (str[a]); - continue; - } - - if ((bi = is_bracket (str[a], RSJbrackets)) >= 0) { - bracket_stack.push_back (bi); - //current.push_back (str[a]); - continue; // to * - } - - // 
------------------------------------ - // otherwise - //current.push_back (str[a]); - } - - /*if (current.length() > 0) - ret.push_back (current); */ - - if (isroot) - delete parse_start_str_pos; - - // return (ret); -} - -// ============================================================ - -// ------------------------------------------------------------ - -inline -RSJobject& RSJresource::as_object (bool force) { - if (!parsed_data_p) parsed_data_p = new RSJparsedData; - if (parsed_data_p->type==RSJ_UNKNOWN || force) parsed_data_p->parse (data, RSJ_OBJECT); - return (parsed_data_p->object); -} - -inline -RSJresource& RSJresource::operator[] (std::string key) { // returns reference - return ( (as_object())[key] ); // will return empty resource (with _exists==false) if - // either this resource does not exist, is not an object, or the key does not exist -} - -inline -RSJarray& RSJresource::as_array (bool force) { - if (!parsed_data_p) parsed_data_p = new RSJparsedData; - if (parsed_data_p->type==RSJ_UNKNOWN || force) parsed_data_p->parse (data, RSJ_ARRAY); - return (parsed_data_p->array); -} - -inline -RSJresource& RSJresource::operator[] (int indx) { // returns reference - as_array(); - if (indx >= parsed_data_p->array.size()) - parsed_data_p->array.resize(indx+1); // insert empty resources - return (parsed_data_p->array[indx]); // will return empty resource (with _exists==false) if - // either this resource does not exist, is not an object, or the key does not exist -} - -// ------------------------------------------------------------ -// special 'as': - -template inline -vectorType RSJresource::as_vector (const vectorType& def) { // returns copy -- for being consistent with other 'as' specializations - if (!exists()) return (def); - vectorType ret; - as_array(); - for (auto it=parsed_data_p->array.begin(); it!=parsed_data_p->array.end(); ++it) - ret.push_back (it->as()); - return (ret); -} - -template inline -mapType RSJresource::as_map (const mapType& def) { // 
returns copy -- for being consistent with other 'as' specializations - if (!exists()) return (def); - mapType ret; - as_object(); - for (auto it=parsed_data_p->object.begin(); it!=parsed_data_p->object.end(); ++it) - ret[it->first] = it->second.as(); - return (ret); -} - -// ============================================================ -// Specialized .as() member functions - -// Helper preprocessor directives -#define rsjObject as() -#define rsjArray as() -#define rsjAs(t) as() - - -// RSJobject -template <> inline -RSJobject RSJresource::as (const RSJobject& def) { // returns copy -- for being consistent with other 'as' specializations - if (!exists()) return (def); - return (as_object()); -} - -// RSJarray -template <> inline -RSJarray RSJresource::as (const RSJarray& def) { // returns copy -- for being consistent with other 'as' specializations - if (!exists()) return (def); - return (as_array()); -} - -// ------------------------------------ -// Elementary types - -// String -template <> inline -std::string RSJresource::as (const std::string& def) { - if (!exists()) return (def); - - char qq = '\0'; - std::string ret = strip_outer_quotes (data, &qq); - - std::vector< std::vector > escapes = { {"\\n","\n"}, {"\\r","\r"}, {"\\t","\t"}, {"\\\\","\\"} }; - if (qq=='"') - escapes.push_back ({"\\\"","\""}); - else if (qq=='\'') - escapes.push_back ({"\\'","'"}); - - for (int a=0; a inline -int RSJresource::as (const int& def) { - if (!exists()) return (def); - return (atoi (strip_outer_quotes(data).c_str() ) ); -} - -// double -template <> inline -double RSJresource::as (const double& def) { - if (!exists()) return (def); - return (atof (strip_outer_quotes(data).c_str() ) ); -} - -// bool -template <> inline -bool RSJresource::as (const bool& def) { - if (!exists()) return (def); - std::string cleanData = strip_outer_quotes (data); - if (cleanData=="true" || cleanData=="TRUE" || cleanData=="True" || atoi(cleanData.c_str())!=0) return (true); - return (false); -} - 
-// ------------------------------------ -// Other types - -/*template <> template inline -bool RSJresource::as< std::vector > (const std::vector& def) { - return as_vector (def); -} - -template <> template inline -std::unordered_map RSJresource::as< std::unordered_map > - (const std::unordered_map& def) { - return as_map (def); -}*/ - -#endif diff --git a/lib/json_parser/RSJparser.tcc.hpp b/lib/json_parser/RSJparser.tcc.hpp deleted file mode 100644 index 6acb97d..0000000 --- a/lib/json_parser/RSJparser.tcc.hpp +++ /dev/null @@ -1,790 +0,0 @@ -/** ************************************************************************************** -* * -* A Ridiculously Simple JSON Parser for C++ (RSJp-cpp) * -* Version 2.x * -* ---------------------------------------------------------- * -* Copyright (C) 2018 Subhrajit Bhattacharya * -* * -* This program is free software: you can redistribute it and/or modify * -* it under the terms of the GNU General Public License as published by * -* the Free Software Foundation, either version 3 of the License, or * -* (at your option) any later version. * -* * -* This program is distributed in the hope that it will be useful, * -* but WITHOUT ANY WARRANTY; without even the implied warranty of * -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * -* GNU General Public License for more details . 
* -* * -* * -* Contact: subhrajit@gmail.com * -* https://www.lehigh.edu/~sub216/ , http://subhrajit.net/ * -* * -* * -*************************************************************************************** **/ - -#ifndef __DOSL_RSJPARSE_TCC -#define __DOSL_RSJPARSE_TCC - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "zstr.hpp" - -static char const* RSJobjectbrackets = "{}"; -static char const* RSJarraybrackets = "[]"; -static char RSJobjectassignment = ':'; -static char RSJarraydelimiter = ','; - -static std::vector RSJbrackets = {RSJobjectbrackets, RSJarraybrackets}; -static std::vector RSJstringquotes = {"\"\"", "''"}; -static char RSJcharescape = '\\'; -static std::string RSJlinecommentstart = "//"; - -static std::string RSJprinttab = " "; - -enum RSJresourceType { RSJ_UNINITIATED, RSJ_UNKNOWN, RSJ_OBJECT, RSJ_ARRAY, RSJ_LEAF }; - -// ============================================================ -// Direct string manipulation functions - -inline -std::string to_string (RSJresourceType rt) { - switch (rt) { - case RSJ_UNINITIATED: return("RSJ_UNINITIATED"); - case RSJ_UNKNOWN: return("RSJ_UNKNOWN"); - case RSJ_OBJECT: return("RSJ_OBJECT"); - case RSJ_ARRAY: return("RSJ_ARRAY"); - case RSJ_LEAF: return("RSJ_LEAF"); - } -} - -enum StrTrimDir { STRTRIM_L=1, STRTRIM_R=2, STRTRIM_LR=3 }; - -inline -std::string strtrim (std::string str, std::string chars=" \t\n\r", int max_count=-1, StrTrimDir dirs=STRTRIM_LR) { - if (str.empty()) return(str); - if (max_count<0) max_count = str.length(); - - if (dirs & STRTRIM_L) { // left trim - int p; - for (p=0; p& bracks, int indx=0) { - for (int b=0; b split_RSJ_array (const std::string& str) { // TODO: Make efficient. This function is speed bottleneck. 
- // splits, while respecting brackets and escapes - std::vector ret; - - std::string current; - std::vector bracket_stack; - std::vector quote_stack; - bool escape_active = false; - int bi; - - for (int a=0; a 0) { // already inside string - if (str[a]==RSJcharescape) // an escape character - escape_active = !escape_active; - else if (!escape_active && str[a]==RSJstringquotes[quote_stack.back()][1] ) { // close quote - quote_stack.pop_back(); - escape_active = false; - } - else - escape_active = false; - - current.push_back (str[a]); - continue; // to * - } - - if (quote_stack.size()==0) { // check for start of string - if ((bi = is_bracket (str[a], RSJstringquotes)) >= 0) { - quote_stack.push_back (bi); - current.push_back (str[a]); - continue; // to * - } - } - - // ------------------------------------ - // checks for comments - - if (quote_stack.size()==0) { // comment cannot start inside string - - // single-line commenst - if (str.compare (a, RSJlinecommentstart.length(), RSJlinecommentstart) == 0) { - // ignore until end of line - int newline_pos = str.find ("\n", a); - if (newline_pos == std::string::npos) - newline_pos = str.find ("\r", a); - - if (newline_pos != std::string::npos) - a = newline_pos; // point to the newline character (a will be incremented) - else // the comment continues until EOF - a = str.length(); - continue; - } - } - - // ------------------------------------ - // checks for brackets - - if ( bracket_stack.size()>0 && str[a]==RSJbrackets[bracket_stack.back()][1] ) { // check for closing bracket - bracket_stack.pop_back(); - current.push_back (str[a]); - continue; - } - - if ((bi = is_bracket (str[a], RSJbrackets)) >= 0) { - bracket_stack.push_back (bi); - current.push_back (str[a]); - continue; // to * - } - - // ------------------------------------ - // otherwise - current.push_back (str[a]); - } - - if (current.length() > 0) - ret.push_back (current); - - return (ret); -} - -inline -std::string insert_tab_after_newlines (std::string 
str) { - for (int a=0; a RSJobject; -typedef std::vector RSJarray; - -// ------------------------------------ -// Main classes - -class RSJresource { -/* Use: RSJresource("RSJ_string_data").as()["keyName"].as()[2].as() - RSJresource("RSJ_string_data")["keyName"][2].as() */ -private: - // main data - std::string data; // can be object, vector or leaf data - bool _exists; // whether the RSJ resource exists. - - // parsed data - RSJparsedData* parsed_data_p; - -public: - // constructor - RSJresource () : _exists (false), parsed_data_p (NULL) { } // no data field. - - RSJresource (std::string str) : data (str), _exists (true), parsed_data_p (NULL) { } - RSJresource (const char* str) : RSJresource(std::string(str)) { } - - // other convertion - template - RSJresource (dataType d) : RSJresource(std::to_string(d)) { } - - // read from file and stream - RSJresource (std::istream& is) : _exists (true), parsed_data_p (NULL) { - data = std::string ( (std::istreambuf_iterator(is)), (std::istreambuf_iterator()) ); - } - RSJresource (std::ifstream& ifs) : _exists (true), parsed_data_p (NULL) { - std::istream& is = ifs; - data = std::string ( (std::istreambuf_iterator(is)), (std::istreambuf_iterator()) ); - } - - RSJresource (zstr::ifstream& ifs) : _exists (true), parsed_data_p (NULL) { - data = std::string((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); - } - - // free allocated memory for parsed data - ~RSJresource(); - - // deep copy - RSJresource (const RSJresource& r); - RSJresource& operator= (const RSJresource& r); - - // ------------------------------------ - // parsers (old) - RSJresourceType parse (bool force=false); - void parse_full (bool force=false, int max_depth=INT_MAX, int* parse_count_for_verbose_p=NULL); // recursively parse the entire JSON text - // parser (new) - void fast_parse (std::string* str_p=NULL, bool copy_string=false, int max_depth=INT_MAX, int* parse_start_str_pos=NULL); // TODO: finish. 
- - RSJobject& as_object (bool force=false); - RSJarray& as_array (bool force=false); - - // ------------------------------------ - - // access raw data and other attributes - int size(void); - std::string& raw_data (void) { return (data); } - bool exists (void) { return (_exists); } - bool is_parsed (void) { return (parsed_data_p!=NULL); } - RSJresourceType type (void); - // emitter - std::string as_str (bool print_comments=false, bool update_data=true); - void print (bool print_comments=false, bool update_data=true) - { std::cout << as_str(print_comments,update_data) << std::endl; } - - // opertor[] - RSJresource& operator[] (std::string key); // object - RSJresource& operator[] (int indx); // array - - // ------------------------------------ - - // as - template - dataType as (const dataType& def = dataType()) { // specialized outside class declaration - if (!exists()) return (def); - return dataType (data); // default behavior for unknown types: invoke 'dataType(std::string)' - } - - // as_vector - template > // vectorType should have push_back method - vectorType as_vector (const vectorType& def = vectorType()); - - // as_map - template > // mapType should have operator[] defined - mapType as_map (const mapType& def = mapType()); -}; - -// ------------------------------------------------------------ - -class RSJparsedData { -public: - RSJobject object; - RSJarray array; - - RSJresourceType type; - RSJparsedData() : type(RSJ_UNKNOWN) {} - - // parser (single-level) - void parse (const std::string& data, RSJresourceType typ = RSJ_UNKNOWN) { - std::string content = strtrim(data); - - if (typ==RSJ_OBJECT || typ==RSJ_UNKNOWN) { - // parse as object: - content = strtrim (strtrim (content, "{", 1, STRTRIM_L ), "}", 1, STRTRIM_R ); - if (content.length() != data.length()) { // a valid object - std::vector nvPairs = split_RSJ_array (content); - for (int a=0; a 0) { - type = RSJ_OBJECT; - return; - } - } - } - - if (typ==RSJ_ARRAY || typ==RSJ_UNKNOWN) { - // parse as 
array - content = strtrim (strtrim (content, "[", 1, STRTRIM_L ), "]", 1, STRTRIM_R ); - if (content.length() != data.length()) { // a valid array - std::vector nvPairs = split_RSJ_array (content); - for (int a=0; a 0) { - type = RSJ_ARRAY; - return; - } - } - } - - if (typ==RSJ_UNKNOWN) - type = RSJ_LEAF; - } - - - // remove non-existing items inserted due to accessing - int cleanup(void) { - - if (type==RSJ_OBJECT) { - bool found = true; - while (found) { - found = false; - for (auto it=object.begin(); it!=object.end(); ++it) - if (!(it->second.exists())) { - object.erase(it); - found = true; - break; // break for loop since it is now invalid - } - } - return (object.size()); - } - - if (type==RSJ_ARRAY) { // erases only the non-existent elements at the tail - while (!(array[array.size()-1].exists())) - array.pop_back(); - return (array.size()); - } - - if (type==RSJ_LEAF) - return (1); - - return (0); - } - - // size - int size(void) { return (cleanup()); } -}; - - -// ------------------------------------------------------------ -// RSJresource member functions - -inline -RSJresource::~RSJresource (){ - if (parsed_data_p) delete parsed_data_p; -} - -inline -RSJresource::RSJresource (const RSJresource& r) { - data=r.data; - _exists = r._exists; - if(r.parsed_data_p) parsed_data_p = new RSJparsedData(*(r.parsed_data_p)); - else parsed_data_p = NULL; -} - -inline -RSJresource& RSJresource::operator= (const RSJresource& r) { - data=r.data; - _exists = r._exists; - if(r.parsed_data_p) parsed_data_p = new RSJparsedData(*(r.parsed_data_p)); - else parsed_data_p = NULL; - return *this; -} - -inline -int RSJresource::size (void) { - if (!exists()) return (0); - parse(); // parse if not parsed - return (parsed_data_p->size()); -} - -inline -RSJresourceType RSJresource::type (void) { - if (!exists()) return (RSJ_UNINITIATED); - parse(); // parse if not parsed - return (parsed_data_p->type); -} - -inline -std::string RSJresource::as_str (bool print_comments, bool 
update_data) { - if (exists()) { - std::string ret; - parse(); // parse if not parsed - parsed_data_p->cleanup(); - - if (parsed_data_p->type==RSJ_OBJECT) { - ret = "{\n"; - for (auto it=parsed_data_p->object.begin(); it!=parsed_data_p->object.end(); ++it) { - ret += RSJprinttab + "'" + it->first + "': " + insert_tab_after_newlines( it->second.as_str (print_comments, update_data) ); - if (std::next(it) != parsed_data_p->object.end()) ret += ","; - if (print_comments) - ret += " // " + to_string(it->second.type()); - ret += "\n"; - } - ret += "}"; - } - else if (parsed_data_p->type==RSJ_ARRAY) { - ret = "[\n"; - for (auto it=parsed_data_p->array.begin(); it!=parsed_data_p->array.end(); ++it) { - ret += RSJprinttab + insert_tab_after_newlines( it->as_str (print_comments, update_data) ); - if (std::next(it) != parsed_data_p->array.end()) ret += ","; - if (print_comments) - ret += " // " + to_string(it->type()); - ret += "\n"; - } - ret += "]"; - } - else // RSJ_LEAF or RSJ_UNKNOWN - ret = strtrim (data); - - if (update_data) data = ret; - return (ret); - } - else - return (""); -} - -// Parsers - -inline -RSJresourceType RSJresource::parse (bool force) { - if (!parsed_data_p) parsed_data_p = new RSJparsedData; - if (parsed_data_p->type==RSJ_UNKNOWN || force) parsed_data_p->parse (data, RSJ_UNKNOWN); - return (parsed_data_p->type); -} - -inline -void RSJresource::parse_full (bool force, int max_depth, int* parse_count_for_verbose_p) { // recursive parsing (slow) - if (max_depth==0) return; - if (!parsed_data_p) parsed_data_p = new RSJparsedData; - if (parsed_data_p->type==RSJ_UNKNOWN || force) parsed_data_p->parse (data, RSJ_UNKNOWN); - // verbose - if (parse_count_for_verbose_p) { - (*parse_count_for_verbose_p)++; - if ( (*parse_count_for_verbose_p) % 100 == 0) - std::cout << "parse_full: " << (*parse_count_for_verbose_p) << " calls." 
<< std::endl; - } - // recursive parse children if not already parsed - if (parsed_data_p->type==RSJ_OBJECT) - for (auto it=parsed_data_p->object.begin(); it!=parsed_data_p->object.end(); ++it) - it->second.parse_full (force, max_depth-1, parse_count_for_verbose_p); - else if (parsed_data_p->type==RSJ_ARRAY) - for (auto it=parsed_data_p->array.begin(); it!=parsed_data_p->array.end(); ++it) - it->parse_full (force, max_depth-1, parse_count_for_verbose_p); -} - -// ------------------------------------------------------------ -// ============================================================ -// FAST PARSER (Under construction. DO NOT use the following functions in your application.) - -inline -int seek_next (std::string* str_p, int start_pos, char character) { - -} - -inline -void RSJresource::fast_parse (std::string* str_p, bool copy_string, int max_depth, int* parse_start_str_pos) { - // TODO: UNDER CONSTRUCTION... - - if (!str_p) - str_p = &data; - std::string& str = *str_p; - - // splits, while respecting brackets and escapes - //std::vector ret; - - //std::string current; - std::vector bracket_stack; - std::vector quote_stack; - bool escape_active = false; - int bi; - - bool initial_whitespaces = true; - bool isroot = false; - - if (!parse_start_str_pos) { - parse_start_str_pos = new int; - *parse_start_str_pos = 0; - isroot = true; - } - - int a = *parse_start_str_pos; - - while (*parse_start_str_pos < str_p->length()) { // * - - // initial whitespace characters - if (initial_whitespaces) { - if (str[a] == ' ' || str[a] == '\n' || str[a] == '\r' || str[a] == '\t' ) { - ++a; - continue; - } - else { - if (str[a] == '{') // start of object - // ... 
TODO: seek_next ':' - - initial_whitespaces = false; - } - } - - - // delimiter - if ( bracket_stack.size()==0 && quote_stack.size()==0 && str[a]==RSJarraydelimiter ) { - //ret.push_back (current); - - //current.clear(); - bracket_stack.clear(); quote_stack.clear(); escape_active = false; - continue; // to * - } - - // ------------------------------------ - // checks for string - - if (quote_stack.size() > 0) { // already inside string - if (str[a]==RSJcharescape) // an escape character - escape_active = !escape_active; - else if (!escape_active && str[a]==RSJstringquotes[quote_stack.back()][1] ) { // close quote - quote_stack.pop_back(); - escape_active = false; - } - else - escape_active = false; - - //current.push_back (str[a]); - continue; // to * - } - - if (quote_stack.size()==0) { // check for start of string - if ((bi = is_bracket (str[a], RSJstringquotes)) >= 0) { - quote_stack.push_back (bi); - //current.push_back (str[a]); - continue; // to * - } - } - - // ------------------------------------ - // checks for comments - - if (quote_stack.size()==0) { // comment cannot start inside string - - // single-line commenst - if (str.compare (a, RSJlinecommentstart.length(), RSJlinecommentstart) == 0) { - // ignore until end of line - int newline_pos = str.find ("\n", a); - if (newline_pos == std::string::npos) - newline_pos = str.find ("\r", a); - - if (newline_pos != std::string::npos) - a = newline_pos; // point to the newline character (a will be incremented) - else // the comment continues until EOF - a = str.length(); - continue; - } - } - - // ------------------------------------ - // checks for brackets - - if ( bracket_stack.size()>0 && str[a]==RSJbrackets[bracket_stack.back()][1] ) { // check for closing bracket - bracket_stack.pop_back(); - //current.push_back (str[a]); - continue; - } - - if ((bi = is_bracket (str[a], RSJbrackets)) >= 0) { - bracket_stack.push_back (bi); - //current.push_back (str[a]); - continue; // to * - } - - // 
------------------------------------ - // otherwise - //current.push_back (str[a]); - } - - /*if (current.length() > 0) - ret.push_back (current); */ - - if (isroot) - delete parse_start_str_pos; - - // return (ret); -} - -// ============================================================ - -// ------------------------------------------------------------ - -inline -RSJobject& RSJresource::as_object (bool force) { - if (!parsed_data_p) parsed_data_p = new RSJparsedData; - if (parsed_data_p->type==RSJ_UNKNOWN || force) parsed_data_p->parse (data, RSJ_OBJECT); - return (parsed_data_p->object); -} - -inline -RSJresource& RSJresource::operator[] (std::string key) { // returns reference - return ( (as_object())[key] ); // will return empty resource (with _exists==false) if - // either this resource does not exist, is not an object, or the key does not exist -} - -inline -RSJarray& RSJresource::as_array (bool force) { - if (!parsed_data_p) parsed_data_p = new RSJparsedData; - if (parsed_data_p->type==RSJ_UNKNOWN || force) parsed_data_p->parse (data, RSJ_ARRAY); - return (parsed_data_p->array); -} - -inline -RSJresource& RSJresource::operator[] (int indx) { // returns reference - as_array(); - if (indx >= parsed_data_p->array.size()) - parsed_data_p->array.resize(indx+1); // insert empty resources - return (parsed_data_p->array[indx]); // will return empty resource (with _exists==false) if - // either this resource does not exist, is not an object, or the key does not exist -} - -// ------------------------------------------------------------ -// special 'as': - -template inline -vectorType RSJresource::as_vector (const vectorType& def) { // returns copy -- for being consistent with other 'as' specializations - if (!exists()) return (def); - vectorType ret; - as_array(); - for (auto it=parsed_data_p->array.begin(); it!=parsed_data_p->array.end(); ++it) - ret.push_back (it->as()); - return (ret); -} - -template inline -mapType RSJresource::as_map (const mapType& def) { // 
returns copy -- for being consistent with other 'as' specializations - if (!exists()) return (def); - mapType ret; - as_object(); - for (auto it=parsed_data_p->object.begin(); it!=parsed_data_p->object.end(); ++it) - ret[it->first] = it->second.as(); - return (ret); -} - -// ============================================================ -// Specialized .as() member functions - -// Helper preprocessor directives -#define rsjObject as() -#define rsjArray as() -#define rsjAs(t) as() - - -// RSJobject -template <> inline -RSJobject RSJresource::as (const RSJobject& def) { // returns copy -- for being consistent with other 'as' specializations - if (!exists()) return (def); - return (as_object()); -} - -// RSJarray -template <> inline -RSJarray RSJresource::as (const RSJarray& def) { // returns copy -- for being consistent with other 'as' specializations - if (!exists()) return (def); - return (as_array()); -} - -// ------------------------------------ -// Elementary types - -// String -template <> inline -std::string RSJresource::as (const std::string& def) { - if (!exists()) return (def); - - char qq = '\0'; - std::string ret = strip_outer_quotes (data, &qq); - - std::vector< std::vector > escapes = { {"\\n","\n"}, {"\\r","\r"}, {"\\t","\t"}, {"\\\\","\\"} }; - if (qq=='"') - escapes.push_back ({"\\\"","\""}); - else if (qq=='\'') - escapes.push_back ({"\\'","'"}); - - for (int a=0; a inline -int RSJresource::as (const int& def) { - if (!exists()) return (def); - return (atoi (strip_outer_quotes(data).c_str() ) ); -} - -// double -template <> inline -double RSJresource::as (const double& def) { - if (!exists()) return (def); - return (atof (strip_outer_quotes(data).c_str() ) ); -} - -// bool -template <> inline -bool RSJresource::as (const bool& def) { - if (!exists()) return (def); - std::string cleanData = strip_outer_quotes (data); - if (cleanData=="true" || cleanData=="TRUE" || cleanData=="True" || atoi(cleanData.c_str())!=0) return (true); - return (false); -} - 
-// ------------------------------------ -// Other types - -/*template <> template inline -bool RSJresource::as< std::vector > (const std::vector& def) { - return as_vector (def); -} - -template <> template inline -std::unordered_map RSJresource::as< std::unordered_map > - (const std::unordered_map& def) { - return as_map (def); -}*/ - -#endif diff --git a/lib/json_parser/lib/CMakeLists.txt b/lib/json_parser/lib/CMakeLists.txt new file mode 100644 index 0000000..0f313e9 --- /dev/null +++ b/lib/json_parser/lib/CMakeLists.txt @@ -0,0 +1,13 @@ +cmake_minimum_required(VERSION 3.0) + +add_library(cpp-json INTERFACE +) + +target_include_directories(cpp-json + INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +target_sources(cpp-json INTERFACE + ${CMAKE_CURRENT_SOURCE_DIR}/include/cpp-json/json.h +) + diff --git a/lib/json_parser/lib/include/cpp-json/json.h b/lib/json_parser/lib/include/cpp-json/json.h new file mode 100644 index 0000000..fab392b --- /dev/null +++ b/lib/json_parser/lib/include/cpp-json/json.h @@ -0,0 +1,2330 @@ + +#ifndef JSON_20110525_H_ +#define JSON_20110525_H_ + +/* TODO(eteran): support unicode + 00 00 00 xx UTF-32BE + 00 xx 00 xx UTF-16BE + xx 00 00 00 UTF-32LE + xx 00 xx 00 UTF-16LE + xx xx xx xx UTF-8 +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __cplusplus >= 201703L +#include +#include +#else +#include +#include +#endif + +namespace json { + +constexpr int IndentWidth = 4; + +#if __cplusplus >= 201703L +namespace NS = std; +#else +namespace NS = boost; +#endif + +class value; +class array; +class object; +class ptr; + +using object_entry = std::pair; +using object_pointer = std::shared_ptr; +using array_pointer = std::shared_ptr; + +// type testing +inline bool is_string(const value &v) noexcept; +inline bool is_bool(const value &v) noexcept; +inline bool is_number(const value &v) noexcept; +inline bool is_object(const value &v) noexcept; 
+inline bool is_array(const value &v) noexcept; +inline bool is_null(const value &v) noexcept; + +// conversion (you get a copy) +inline std::string to_string(const value &v); +inline bool to_bool(const value &v); +inline object to_object(const value &v); +inline array to_array(const value &v); + +template ::value>::type> +T to_number(const value &v); + +// interpretation (you get a reference) +inline object & as_object(value &v); +inline array & as_array(value &v); +inline std::string & as_string(value &v); +inline const object & as_object(const value &v); +inline const array & as_array(const value &v); +inline const std::string &as_string(const value &v); + +// does the given object have a given key? +inline bool has_key(const value &v, const std::string &key) noexcept; +inline bool has_key(const object &o, const std::string &key) noexcept; + +// create a value from some JSON +template +inline value parse(In first, In last); +inline value parse(std::istream &is); +inline value parse(std::istream &&is); +inline value parse(NS::string_view s); + +// convert a value to a JSON string +enum Options { + None = 0x00, + EscapeUnicode = 0x01, + PrettyPrint = 0x02, +}; + +constexpr inline Options operator&(Options lhs, Options rhs) noexcept { + using T = std::underlying_type::type; + return static_cast(static_cast(lhs) & static_cast(rhs)); +} + +constexpr inline Options operator|(Options lhs, Options rhs) noexcept { + using T = std::underlying_type::type; + return static_cast(static_cast(lhs) | static_cast(rhs)); +} + +template ::value || std::is_same::value || std::is_same::value>::type> +std::string stringify(const T &v, Options options = Options::None); + +template ::value || std::is_same::value || std::is_same::value>::type> +void stringify(std::ostream &os, const T &v, Options options = Options::None); + +// general error +class exception { +public: + int line = -1; + int column = -1; +}; + +// parsing errors +class boolean_expected : public exception {}; +class 
brace_expected : public exception {}; +class bracket_expected : public exception {}; +class colon_expected : public exception {}; +class hex_character_expected : public exception {}; +class quote_expected : public exception {}; +class invalid_unicode_character : public exception {}; +class keyword_expected : public exception {}; +class string_expected : public exception {}; +class value_expected : public exception {}; +class utf16_surrogate_expected : public exception {}; +class invalid_number : public exception {}; +class invalid_utf8_string : public exception {}; + +// usage errors +class invalid_type_cast : public exception {}; +class invalid_index : public exception {}; + +// pointer errors +class invalid_path : public exception {}; +class empty_reference_token : public exception {}; +class invalid_reference_escape : public exception {}; + +namespace detail { + +/** + * @brief to_hex + * @param ch + * @return + */ +template +unsigned int to_hex(Ch ch) { + + static const unsigned int hexval[256] = { + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; + + if (static_cast(ch) < 256) { + return hexval[static_cast(ch)]; + } else { + return 0; + } +} + +/** + * @brief surrogate_pair_to_utf8 + * @param w1 + * @param w2 + * @param out + */ +template +void surrogate_pair_to_utf8(uint16_t w1, uint16_t w2, Out &out) { + + uint32_t cp; + if ((w1 & 0xfc00) == 0xd800) { + if ((w2 & 0xfc00) == 0xdc00) { + cp = 0x10000 + (((static_cast(w1) & 0x3ff) << 10) | (w2 & 
0x3ff)); + } else { + throw invalid_unicode_character(); + } + } else { + cp = w1; + } + + if (cp < 0x80) { + *out++ = static_cast(cp); + } else if (cp < 0x0800) { + *out++ = static_cast(0xc0 | ((cp >> 6) & 0x1f)); + *out++ = static_cast(0x80 | (cp & 0x3f)); + } else if (cp < 0x10000) { + *out++ = static_cast(0xe0 | ((cp >> 12) & 0x0f)); + *out++ = static_cast(0x80 | ((cp >> 6) & 0x3f)); + *out++ = static_cast(0x80 | (cp & 0x3f)); + } else if (cp < 0x1fffff) { + *out++ = static_cast(0xf0 | ((cp >> 18) & 0x07)); + *out++ = static_cast(0x80 | ((cp >> 12) & 0x3f)); + *out++ = static_cast(0x80 | ((cp >> 6) & 0x3f)); + *out++ = static_cast(0x80 | (cp & 0x3f)); + } +} + +template +struct to_number_helper {}; + +template <> +struct to_number_helper { + float convert(const value &v) { return stof(as_string(v), nullptr); } +}; +template <> +struct to_number_helper { + double convert(const value &v) { return stod(as_string(v), nullptr); } +}; + +template <> +struct to_number_helper { + uint8_t convert(const value &v) { return static_cast(stoul(as_string(v), nullptr)); } +}; +template <> +struct to_number_helper { + uint16_t convert(const value &v) { return static_cast(stoul(as_string(v), nullptr)); } +}; +template <> +struct to_number_helper { + uint32_t convert(const value &v) { return static_cast(stoul(as_string(v), nullptr)); } +}; +template <> +struct to_number_helper { + uint64_t convert(const value &v) { return stoull(as_string(v), nullptr); } +}; + +template <> +struct to_number_helper { + int8_t convert(const value &v) { return static_cast(stol(as_string(v), nullptr)); } +}; +template <> +struct to_number_helper { + int16_t convert(const value &v) { return static_cast(stol(as_string(v), nullptr)); } +}; +template <> +struct to_number_helper { + int32_t convert(const value &v) { return static_cast(stol(as_string(v), nullptr)); } +}; +template <> +struct to_number_helper { + int64_t convert(const value &v) { return stoll(as_string(v), nullptr); } +}; +} + +template +T 
to_number(const value &v) { + if (!is_number(v)) { + throw invalid_type_cast(); + } + + detail::to_number_helper helper; + return helper.convert(v); +} + +/** + * @brief The ptr class + */ +class ptr { +private: + using C = std::vector; + +public: + using allocator_type = typename C::allocator_type; + using reference = typename C::reference; + using const_reference = typename C::const_reference; + using pointer = typename C::pointer; + using const_pointer = typename C::const_pointer; + using iterator = typename C::iterator; + using const_iterator = typename C::const_iterator; + using reverse_iterator = typename C::reverse_iterator; + using const_reverse_iterator = typename C::const_reverse_iterator; + using difference_type = typename C::difference_type; + using size_type = typename C::size_type; + +public: + explicit ptr(NS::string_view path) { + + auto it = path.begin(); + + bool uri_format = false; + + if (it != path.end()) { + + // normal or URI fragment notation? + if (*it == '#') { + ++it; + uri_format = true; + } + + while (it != path.end()) { + if (*it++ != '/') { + throw invalid_path(); + } + + std::string reference_token; + while (it != path.end() && *it != '/') { + char ch = *it; + + if (!uri_format) { + if (ch == '~') { + + // ~1 -> / + // ~0 -> ~ + + ++it; + if (it == path.end()) { + throw invalid_reference_escape(); + } + + switch (*it) { + case '0': + ch = '~'; + break; + case '1': + ch = '/'; + break; + default: + throw invalid_reference_escape(); + } + } + } else { + // %XX -> char(0xXX) + + if (ch == '%') { + ++it; + if (it == path.end()) { + throw invalid_reference_escape(); + } + + char hex[2]; + if (!isxdigit(*it)) { + throw invalid_reference_escape(); + } + + hex[0] = *it++; + if (it == path.end()) { + throw invalid_reference_escape(); + } + + if (!isxdigit(*it)) { + throw invalid_reference_escape(); + } + + hex[1] = *it; + + ch = static_cast((detail::to_hex(hex[0]) << 4) | (detail::to_hex(hex[1]))); + } else if (ch == '~') { + // ~1 -> / + // 
~0 -> ~ + + ++it; + if (it == path.end()) { + throw invalid_reference_escape(); + } + + switch (*it) { + case '0': + ch = '~'; + break; + case '1': + ch = '/'; + break; + default: + throw invalid_reference_escape(); + } + } + } + + reference_token.push_back(ch); + ++it; + } + + path_.push_back(reference_token); + } + } + } + +public: + ptr() = default; + ptr(ptr &&other) = default; + ptr(const ptr &other) = default; + ptr &operator=(ptr &&rhs) = default; + ptr &operator=(const ptr &rhs) = default; + +public: + iterator begin() noexcept { return path_.begin(); } + iterator end() noexcept { return path_.end(); } + const_iterator begin() const noexcept { return path_.begin(); } + const_iterator end() const noexcept { return path_.end(); } + const_iterator cbegin() const noexcept { return path_.begin(); } + const_iterator cend() const noexcept { return path_.end(); } + reverse_iterator rbegin() noexcept { return path_.rbegin(); } + reverse_iterator rend() noexcept { return path_.rend(); } + const_reverse_iterator rbegin() const noexcept { return path_.rbegin(); } + const_reverse_iterator rend() const noexcept { return path_.rend(); } + const_reverse_iterator crbegin() const noexcept { return path_.rbegin(); } + const_reverse_iterator crend() const noexcept { return path_.rend(); } + +public: + size_type size() const noexcept { return path_.size(); } + size_type max_size() const noexcept { return path_.max_size(); } + bool empty() const noexcept { return path_.empty(); } + +public: + value operator[](std::size_t n) const; + value &operator[](std::size_t n); + value at(std::size_t n) const; + value &at(std::size_t n); + +private: + C path_; +}; + +/** + * @brief The object class + */ +class object { + friend bool operator==(const object &lhs, const object &rhs) noexcept; + friend bool operator!=(const object &lhs, const object &rhs) noexcept; + + template + friend class parser; + +private: + using C = std::vector; + +public: + using allocator_type = typename 
C::allocator_type; + using reference = typename C::reference; + using const_reference = typename C::const_reference; + using pointer = typename C::pointer; + using const_pointer = typename C::const_pointer; + using iterator = typename C::iterator; + using const_iterator = typename C::const_iterator; + using difference_type = typename C::difference_type; + using size_type = typename C::size_type; + +public: + object() = default; + object(const object &other) = default; + object(object &&other) = default; + object &operator=(const object &rhs) = default; + object &operator=(object &&rhs) = default; + object(std::initializer_list list); + +public: + iterator begin() noexcept { return values_.begin(); } + iterator end() noexcept { return values_.end(); } + const_iterator begin() const noexcept { return values_.begin(); } + const_iterator end() const noexcept { return values_.end(); } + const_iterator cbegin() const noexcept { return values_.begin(); } + const_iterator cend() const noexcept { return values_.end(); } + +public: + iterator find(const std::string &s) noexcept; + const_iterator find(const std::string &s) const noexcept; + +public: + size_type size() const noexcept { + return values_.size(); + } + + size_type max_size() const noexcept { + return values_.max_size(); + } + + bool empty() const noexcept { + return values_.empty(); + } + +public: + value operator[](const std::string &key) const; + value &operator[](const std::string &key); + + value at(const std::string &key) const; + value &at(const std::string &key); + +public: + template + std::pair insert(std::string key, const T &v); + + template + std::pair insert(std::string key, T &&v); + + template + std::pair insert(std::pair &&p); + +public: + void swap(object &other) noexcept; + +private: + C values_; + + // NOTE(eteran): The values are stored in insertion order above, + // but we use this map to have a fast lookup of key -> index + std::map index_map_; +}; + +inline object::iterator begin(object 
&obj) noexcept { + return obj.begin(); +} + +inline object::iterator end(object &obj) noexcept { + return obj.end(); +} + +inline object::const_iterator begin(const object &obj) noexcept { + return obj.begin(); +} + +inline object::const_iterator end(const object &obj) noexcept { + return obj.end(); +} + +inline object::const_iterator cbegin(const object &obj) noexcept { + return obj.begin(); +} + +inline object::const_iterator cend(const object &obj) noexcept { + return obj.end(); +} + +/** + * @brief The array class + */ +class array { + friend bool operator==(const array &lhs, const array &rhs) noexcept; + friend bool operator!=(const array &lhs, const array &rhs) noexcept; + +private: + using C = std::vector; + +public: + using allocator_type = typename C::allocator_type; + using reference = typename C::reference; + using const_reference = typename C::const_reference; + using pointer = typename C::pointer; + using const_pointer = typename C::const_pointer; + using iterator = typename C::iterator; + using const_iterator = typename C::const_iterator; + using reverse_iterator = typename C::reverse_iterator; + using const_reverse_iterator = typename C::const_reverse_iterator; + using difference_type = typename C::difference_type; + using size_type = typename C::size_type; + +public: + array() = default; + array(array &&other) = default; + array(const array &other) = default; + array &operator=(array &&rhs) = default; + array &operator=(const array &rhs) = default; + array(std::initializer_list list); + + template + array(In first, In last) { + values_.insert(values_.end(), first, last); + } + +public: + iterator begin() noexcept { return values_.begin(); } + iterator end() noexcept { return values_.end(); } + const_iterator begin() const noexcept { return values_.begin(); } + const_iterator end() const noexcept { return values_.end(); } + const_iterator cbegin() const noexcept { return values_.begin(); } + const_iterator cend() const noexcept { return 
values_.end(); } + reverse_iterator rbegin() noexcept { return values_.rbegin(); } + reverse_iterator rend() noexcept { return values_.rend(); } + const_reverse_iterator rbegin() const noexcept { return values_.rbegin(); } + const_reverse_iterator rend() const noexcept { return values_.rend(); } + const_reverse_iterator crbegin() const noexcept { return values_.rbegin(); } + const_reverse_iterator crend() const noexcept { return values_.rend(); } + +public: + size_type size() const noexcept { return values_.size(); } + size_type max_size() const noexcept { return values_.max_size(); } + bool empty() const noexcept { return values_.empty(); } + +public: + value operator[](std::size_t n) const; + value &operator[](std::size_t n); + value at(std::size_t n) const; + value &at(std::size_t n); + +public: + template + void push_back(T &&v) { + values_.emplace_back(std::forward(v)); + } + + template + void push_back(const T &v) { + values_.emplace_back(v); + } + + void pop_back() noexcept { + values_.pop_back(); + } + +public: + void swap(array &other) noexcept { + using std::swap; + swap(values_, other.values_); + } + +private: + C values_; +}; + +inline array::iterator begin(array &arr) noexcept { + return arr.begin(); +} + +inline array::iterator end(array &arr) noexcept { + return arr.end(); +} + +inline array::const_iterator begin(const array &arr) noexcept { + return arr.begin(); +} + +inline array::const_iterator end(const array &arr) noexcept { + return arr.end(); +} + +inline array::const_iterator cbegin(const array &arr) noexcept { + return arr.begin(); +} + +inline array::const_iterator cend(const array &arr) noexcept { + return arr.end(); +} + +inline array::reverse_iterator rbegin(array &arr) noexcept { + return arr.rbegin(); +} + +inline array::reverse_iterator rend(array &arr) noexcept { + return arr.rend(); +} + +inline array::const_reverse_iterator rbegin(const array &arr) noexcept { + return arr.rbegin(); +} + +inline array::const_reverse_iterator 
rend(const array &arr) noexcept { + return arr.rend(); +} + +inline array::const_reverse_iterator crbegin(const array &arr) noexcept { + return arr.rbegin(); +} + +inline array::const_reverse_iterator crend(const array &arr) noexcept { + return arr.rend(); +} + +/** + * @brief The value class + */ +class value { + friend bool to_bool(const value &v); + + friend bool operator==(const value &lhs, const value &rhs); + friend bool operator!=(const value &lhs, const value &rhs); + + template + friend class parser; + +private: + struct numeric_type {}; + // create a value from a numeric string, internal use only! + value(std::string s, const numeric_type &) + : storage_(std::move(s)), type_(type_number) { + } + +public: + // intialize from basic types + explicit value(const array &a); + explicit value(const object &o); + + value(array &&a); + value(object &&o); + + value(bool b) + : storage_(b ? Boolean::True : Boolean::False), type_(type_boolean) { + } + + // NOTE(eteran): we don't use string_view here because of the bool overload + // which necessitates that we have a const char * overload to prevent value("hello") + // from creating a "True" value. 
Since we need this overload anyway, no real benefit + // to using a string_view + value(const char *s) + : storage_(std::string(s)), type_(type_string) { + } + + value(std::string s) + : storage_(std::move(s)), type_(type_string) { + } + + template ::value>::type> + value(T n) + : storage_(std::to_string(n)), type_(type_number) { + } + + value(const std::nullptr_t &) + : storage_(Null()), type_(type_null) { + } + + value() + : storage_(Null()), type_(type_null) { + } + +public: + ~value() = default; + +private: + explicit value(object_pointer o); + explicit value(array_pointer a); + +public: + value(const value &other) + : storage_(other.storage_), type_(other.type_) { + } + + value(value &&other) + : storage_(std::move(other.storage_)), type_(other.type_) { + } + +public: + value &operator=(const value &rhs); + value &operator=(value &&rhs); + +public: + void swap(value &other) noexcept { + using std::swap; + swap(storage_, other.storage_); + swap(type_, other.type_); + } + +public: + enum Type { + type_invalid, + type_null, + type_boolean, + type_object, + type_array, + type_string, + type_number, + }; + + Type type() const noexcept { return type_; } + +public: + value operator[](const std::string &key) const; + value operator[](std::size_t n) const; + value &operator[](const std::string &key); + value &operator[](std::size_t n); + +public: + inline value at(std::size_t n) const; + inline value &at(std::size_t n); + inline value at(const std::string &key) const; + inline value &at(const std::string &key); + +public: + value operator[](const ptr &ptr) const; + value &operator[](const ptr &ptr); + + value &create(const ptr &ptr); + +public: + // array like interface + template + void push_back(T &&v); + + template + void push_back(const T &v); + +public: + // object like interface + template + std::pair insert(std::string key, const T &v); + + template + std::pair insert(std::string key, T &&v); + + template + std::pair insert(std::pair &&p); + +public: + // 
object/array like + size_t size() const { + if (is_object()) { + return as_object().size(); + } else if (is_array()) { + return as_array().size(); + } + + throw invalid_type_cast(); + } + +public: + bool is_string() const noexcept { + return (type_ == value::type_string); + } + + bool is_bool() const noexcept { + return (type_ == value::type_boolean); + } + + bool is_number() const noexcept { + return (type_ == value::type_number); + } + + bool is_object() const noexcept { + return (type_ == value::type_object); + } + + bool is_array() const noexcept { + return (type_ == value::type_array); + } + + bool is_null() const noexcept { + return (type_ == value::type_null); + } + +public: + const std::string &as_string() const { + switch (type_) { + case value::type_string: + case value::type_number: + return NS::get(storage_); + default: + throw invalid_type_cast(); + } + } + + std::string &as_string() { + switch (type_) { + case value::type_string: + case value::type_number: + return NS::get(storage_); + default: + throw invalid_type_cast(); + } + } + + const object &as_object() const { + if (type_ != type_object) { + throw invalid_type_cast(); + } + + return *NS::get(storage_); + } + + object &as_object() { + if (type_ != type_object) { + throw invalid_type_cast(); + } + + return *NS::get(storage_); + } + + const array &as_array() const { + if (type_ != type_array) { + throw invalid_type_cast(); + } + + return *NS::get(storage_); + } + + array &as_array() { + if (type_ != type_array) { + throw invalid_type_cast(); + } + + return *NS::get(storage_); + } + +private: + struct Invalid {}; + struct Null {}; + + enum class Boolean { + False, + True, + }; + + NS::variant storage_; + Type type_ = type_invalid; +}; + +inline value array::operator[](std::size_t n) const { + return at(n); +} + +inline value &array::operator[](std::size_t n) { + return at(n); +} + +inline value array::at(std::size_t n) const { + if (n < values_.size()) { + return values_[n]; + } + + throw 
invalid_index(); +} + +inline value &array::at(std::size_t n) { + if (n < values_.size()) { + return values_[n]; + } + + throw invalid_index(); +} + +/** + * @brief The parser class + */ +template +class parser { +public: + parser(In first, In last) + : begin_(first), cur_(first), end_(last) { + } + +public: + value parse() { + return get_value(); + } + +public: + int line() const noexcept { return line_; } + int column() const noexcept { return column_; } + +private: + static constexpr char ArrayBegin = '['; + static constexpr char ArrayEnd = ']'; + static constexpr char NameSeparator = ':'; + static constexpr char ValueSeparator = ','; + static constexpr char ObjectBegin = '{'; + static constexpr char ObjectEnd = '}'; + static constexpr char Quote = '"'; + +private: + bool get_false() { + if (read() != 'f') { + throw boolean_expected(); + } + if (read() != 'a') { + throw boolean_expected(); + } + if (read() != 'l') { + throw boolean_expected(); + } + if (read() != 's') { + throw boolean_expected(); + } + if (read() != 'e') { + throw boolean_expected(); + } + + return false; + } + + bool get_true() { + if (read() != 't') { + throw boolean_expected(); + } + if (read() != 'r') { + throw boolean_expected(); + } + if (read() != 'u') { + throw boolean_expected(); + } + if (read() != 'e') { + throw boolean_expected(); + } + + return true; + } + + std::nullptr_t get_null() { + if (read() != 'n') { + throw keyword_expected(); + } + if (read() != 'u') { + throw keyword_expected(); + } + if (read() != 'l') { + throw keyword_expected(); + } + if (read() != 'l') { + throw keyword_expected(); + } + + return nullptr; + } + + array_pointer get_array() { + auto arr = std::make_shared(); + + if (read() != ArrayBegin) { + throw bracket_expected(); + } + + // handle empty object + char tok = peek(); + if (tok == ArrayEnd) { + read(); + } else { + do { + arr->push_back(get_value()); + tok = read(); + } while (tok == ValueSeparator); + } + + if (tok != ArrayEnd) { + throw 
bracket_expected(); + } + + return arr; + } + + object_pointer get_object() { + auto obj = std::make_shared(); + + if (read() != ObjectBegin) { + throw brace_expected(); + } + + // handle empty object + char tok = peek(); + if (tok == ObjectEnd) { + read(); + } else { + do { + obj->insert(get_pair()); + tok = read(); + } while (tok == ValueSeparator); + } + + if (tok != ObjectEnd) { + throw brace_expected(); + } + + return obj; + } + + object_entry get_pair() { + std::string key = get_string(); + + if (read() != NameSeparator) { + throw colon_expected(); + } + + return std::make_pair(std::move(key), get_value()); + } + + std::string get_number(); + std::string get_string(); + + value get_value() { + switch (peek()) { + case ObjectBegin: + return value(get_object()); + case ArrayBegin: + return value(get_array()); + case Quote: + return value(get_string()); + case 't': + return value(get_true()); + case 'f': + return value(get_false()); + case 'n': + return value(get_null()); + default: + return value(get_number(), value::numeric_type()); + } + + throw value_expected(); + } + +private: + void update_pos() { + if (*cur_ == '\n') { + column_ = 0; + ++line_; + } else { + ++column_; + } + } + + void consume_whitespace() { + while (!at_end() && std::isspace(*cur_)) { + update_pos(); + ++cur_; + } + } + + char peek_no_consume() { + if (at_end()) { + return '\0'; + } + + return *cur_; + } + + char peek() { + // first eat up some whitespace + consume_whitespace(); + + return peek_no_consume(); + } + + char read_no_consume() { + if (at_end()) { + return '\0'; + } + + update_pos(); + return *cur_++; + } + + char read() { + // first eat up some whitespace + consume_whitespace(); + + return read_no_consume(); + } + + bool at_end() const noexcept { + return cur_ == end_; + } + +private: + In begin_; + In cur_; + In end_; + + int line_ = 1; + int column_ = 0; +}; + +template +value parse(In first, In last) { + + parser p(first, last); + + try { + return p.parse(); + } catch 
(exception &e) { + e.line = p.line(); + e.column = p.column(); + throw; + } +} + +inline std::string to_string(const value &v) { + return as_string(v); +} + +inline bool to_bool(const value &v) { + if (!is_bool(v)) { + throw invalid_type_cast(); + } + + return NS::get(v.storage_) == value::Boolean::True; +} + +inline object to_object(const value &v) { + return as_object(v); +} + +inline array to_array(const value &v) { + return as_array(v); +} + +inline object &as_object(array &v) { + (void)v; + throw invalid_type_cast(); +} + +inline array &as_array(object &v) { + (void)v; + throw invalid_type_cast(); +} + +inline const object &as_object(const array &v) { + (void)v; + throw invalid_type_cast(); +} + +inline const array &as_array(const object &v) { + (void)v; + throw invalid_type_cast(); +} + +inline object &as_object(value &v) { + if (!is_object(v)) { + throw invalid_type_cast(); + } + + return v.as_object(); +} + +inline const object &as_object(const value &v) { + if (!is_object(v)) { + throw invalid_type_cast(); + } + + return v.as_object(); +} + +inline array &as_array(value &v) { + if (!is_array(v)) { + throw invalid_type_cast(); + } + + return v.as_array(); +} + +inline const array &as_array(const value &v) { + if (!is_array(v)) { + throw invalid_type_cast(); + } + + return v.as_array(); +} + +const std::string &as_string(const value &v) { + if (!is_string(v) && !is_number(v)) { + throw invalid_type_cast(); + } + + return v.as_string(); +} + +std::string &as_string(value &v) { + if (!is_string(v) && !is_number(v)) { + throw invalid_type_cast(); + } + + return v.as_string(); +} + +inline bool has_key(const value &v, const std::string &key) noexcept { + if (is_object(v)) { + return has_key(as_object(v), key); + } + return false; +} + +inline bool has_key(const object &o, const std::string &key) noexcept { + return o.find(key) != o.end(); +} + +inline value parse(std::istream &&is) { + return parse(is); +} + +inline value parse(std::istream &is) { + return 
parse(std::istreambuf_iterator{is}, std::istreambuf_iterator{}); +} + +inline value parse(NS::string_view s) { + return parse(s.begin(), s.end()); +} + +inline bool is_string(const value &v) noexcept { + return v.is_string(); +} + +inline bool is_bool(const value &v) noexcept { + return v.is_bool(); +} + +inline bool is_number(const value &v) noexcept { + return v.is_number(); +} + +inline bool is_object(const value &v) noexcept { + return v.is_object(); +} + +inline bool is_array(const value &v) noexcept { + return v.is_array(); +} + +inline bool is_null(const value &v) noexcept { + return v.is_null(); +} + +namespace detail { + +inline std::string escape_string(NS::string_view s, Options options) { + + std::string r; + r.reserve(s.size()); + + if (options & Options::EscapeUnicode) { + struct state_t { + unsigned int + expected : 4, + seen : 4, + reserved : 24; + }; + + state_t shift_state = {0, 0, 0}; + char32_t result = 0; + + for (auto it = s.begin(); it != s.end(); ++it) { + + const auto ch = static_cast(*it); + + if (shift_state.seen == 0) { + + if ((ch & 0x80) == 0) { + switch (ch) { + case '\"': + r += "\\\""; + break; + case '\\': + r += "\\\\"; + break; +#if 0 + case '/': r += "\\/"; break; +#endif + case '\b': + r += "\\b"; + break; + case '\f': + r += "\\f"; + break; + case '\n': + r += "\\n"; + break; + case '\r': + r += "\\r"; + break; + case '\t': + r += "\\t"; + break; + default: + if (!isprint(ch)) { + r += "\\u"; + char buf[5]; + snprintf(buf, sizeof(buf), "%04X", ch); + r += buf; + } else { + r += static_cast(ch); + } + break; + } + } else if ((ch & 0xe0) == 0xc0) { + // 2 byte + result = ch & 0x1f; + shift_state.expected = 2; + shift_state.seen = 1; + } else if ((ch & 0xf0) == 0xe0) { + // 3 byte + result = ch & 0x0f; + shift_state.expected = 3; + shift_state.seen = 1; + } else if ((ch & 0xf8) == 0xf0) { + // 4 byte + result = ch & 0x07; + shift_state.expected = 4; + shift_state.seen = 1; + } else if ((ch & 0xfc) == 0xf8) { + // 5 byte + throw 
invalid_utf8_string(); // Restricted by RFC 3629 + } else if ((ch & 0xfe) == 0xfc) { + // 6 byte + throw invalid_utf8_string(); // Restricted by RFC 3629 + } else { + throw invalid_utf8_string(); // should never happen + } + } else if (shift_state.seen < shift_state.expected) { + if ((ch & 0xc0) == 0x80) { + result <<= 6; + result |= ch & 0x3f; + // increment the shift state + ++shift_state.seen; + + if (shift_state.seen == shift_state.expected) { + // done with this character + + char buf[5]; + + if (result < 0xd800 || (result >= 0xe000 && result < 0x10000)) { + r += "\\u"; + snprintf(buf, sizeof(buf), "%04X", result); + r += buf; + } else { + result = (result - 0x10000); + + r += "\\u"; + snprintf(buf, sizeof(buf), "%04X", 0xd800 + ((result >> 10) & 0x3ff)); + r += buf; + + r += "\\u"; + snprintf(buf, sizeof(buf), "%04X", 0xdc00 + (result & 0x3ff)); + r += buf; + } + + shift_state.seen = 0; + shift_state.expected = 0; + result = 0; + } + + } else { + throw invalid_utf8_string(); // should never happen + } + } else { + throw invalid_utf8_string(); // should never happen + } + } + } else { + + for (char ch : s) { + + switch (ch) { + case '\"': + r += "\\\""; + break; + case '\\': + r += "\\\\"; + break; +#if 0 + case '/': r += "\\/"; break; +#endif + case '\b': + r += "\\b"; + break; + case '\f': + r += "\\f"; + break; + case '\n': + r += "\\n"; + break; + case '\r': + r += "\\r"; + break; + case '\t': + r += "\\t"; + break; + default: + r += ch; + break; + } + } + } + return r; +} + +inline std::string escape_string(NS::string_view s) { + return escape_string(s, Options::None); +} + +// pretty print as a string +inline void value_to_string(std::ostream &os, const value &v, Options options, int indent, bool ignore_initial_ident); + +inline void value_to_string(std::ostream &os, const object &o, Options options, int indent, bool ignore_initial_ident) { + + if (!ignore_initial_ident) { + os << std::string(indent * IndentWidth, ' '); + } + + if (o.empty()) { + os << 
"{}"; + } else { + os << "{\n"; + + auto it = o.begin(); + auto e = o.end(); + + ++indent; + os << std::string(indent * IndentWidth, ' ') << '"' << escape_string(it->first, options) << "\" : "; + value_to_string(os, it->second, options, indent, true); + + ++it; + for (; it != e; ++it) { + os << ','; + os << '\n'; + os << std::string(indent * IndentWidth, ' ') << '"' << escape_string(it->first, options) << "\" : "; + value_to_string(os, it->second, options, indent, true); + } + --indent; + + os << "\n"; + os << std::string(indent * IndentWidth, ' ') << "}"; + } +} + +inline void value_to_string(std::ostream &os, const array &a, Options options, int indent, bool ignore_initial_ident) { + + if (!ignore_initial_ident) { + os << std::string(indent * IndentWidth, ' '); + } + + if (a.empty()) { + os << "[]"; + } else { + os << "[\n"; + + auto it = a.begin(); + auto e = a.end(); + + ++indent; + value_to_string(os, *it++, options, indent, false); + + for (; it != e; ++it) { + os << ','; + os << '\n'; + value_to_string(os, *it, options, indent, false); + } + --indent; + + os << "\n"; + os << std::string(indent * IndentWidth, ' ') << "]"; + } +} + +inline void value_to_string(std::ostream &os, const value &v, Options options, int indent, bool ignore_initial_ident) { + + if (!ignore_initial_ident) { + os << std::string(indent * IndentWidth, ' '); + } + + switch (v.type()) { + case value::type_string: + os << '"' << escape_string(as_string(v), options) << '"'; + break; + case value::type_number: + os << as_string(v); + break; + case value::type_null: + os << "null"; + break; + case value::type_boolean: + os << (to_bool(v) ? 
"true" : "false"); + break; + case value::type_object: + value_to_string(os, as_object(v), options, indent, true); + break; + case value::type_array: + value_to_string(os, as_array(v), options, indent, true); + break; + case value::type_invalid: + break; + } +} + +inline std::string value_to_string(const value &v, Options options, int indent, bool ignore_initial_ident) { + + std::stringstream ss; + value_to_string(ss, v, options, indent, ignore_initial_ident); + return ss.str(); +} + +inline std::string value_to_string(const value &v, Options options) { + return value_to_string(v, options, 0, false); +} + +inline void value_to_string(std::ostream &os, const value &v, Options options) { + value_to_string(os, v, options, 0, false); +} + +// serialize, not pretty printed +inline void serialize(std::ostream &os, const value &v, Options options); + +inline void serialize(std::ostream &os, const array &a, Options options) { + os << "["; + if (!a.empty()) { + auto it = a.begin(); + auto e = a.end(); + + serialize(os, *it++, options); + + for (; it != e; ++it) { + os << ','; + serialize(os, *it, options); + } + } + os << "]"; +} + +inline void serialize(std::ostream &os, const object &o, Options options) { + os << "{"; + if (!o.empty()) { + auto it = o.begin(); + auto e = o.end(); + + os << '"' << escape_string(it->first, options) << "\":"; + serialize(os, it->second, options); + ++it; + for (; it != e; ++it) { + os << ','; + os << '"' << escape_string(it->first, options) << "\":"; + serialize(os, it->second, options); + } + } + os << "}"; +} + +inline void serialize(std::ostream &os, const value &v, Options options) { + + switch (v.type()) { + case value::type_string: + os << '"' << escape_string(as_string(v), options) << '"'; + break; + case value::type_number: + os << as_string(v); + break; + case value::type_null: + os << "null"; + break; + case value::type_boolean: + os << (to_bool(v) ? 
"true" : "false"); + break; + case value::type_object: { + serialize(os, as_object(v), options); + break; + } + case value::type_array: { + serialize(os, as_array(v), options); + break; + } + case value::type_invalid: + break; + } +} + +template ::value || std::is_same::value || std::is_same::value>::type> +std::string serialize(const T &v, Options options) { + std::stringstream ss; + + std::locale c_locale("C"); + ss.imbue(c_locale); + + serialize(ss, v, options); + return ss.str(); +} + +template ::value || std::is_same::value || std::is_same::value>::type> +std::string pretty_print(const T &v, Options options) { + return value_to_string(value(v), options); +} + +template ::value || std::is_same::value || std::is_same::value>::type> +void pretty_print(std::ostream &os, const T &v, Options options) { + value_to_string(os, value(v), options); +} + +} + +template +std::string stringify(const T &v, Options options) { + if (options & Options::PrettyPrint) { + return detail::pretty_print(v, options); + } else { + return detail::serialize(v, options); + } +} + +template +void stringify(std::ostream &os, const T &v, Options options) { + + std::locale c_locale("C"); + os.imbue(c_locale); + + if (options & Options::PrettyPrint) { + detail::pretty_print(os, v, options); + } else { + detail::serialize(os, v, options); + } +} + +/** + * @brief object::swap + * @param other + */ +inline void object::swap(object &other) noexcept { + using std::swap; + swap(values_, other.values_); + swap(index_map_, other.index_map_); +} + +/** + * @brief object::object + * @param list + */ +inline object::object(std::initializer_list list) { + + for (auto &entry : list) { + insert(entry.first, entry.second); + } +} + +inline value object::operator[](const std::string &key) const { + return at(key); +} + +inline value &object::operator[](const std::string &key) { + return at(key); +} + +inline object::iterator object::find(const std::string &s) noexcept { + + auto it = index_map_.find(s); + if 
(it != index_map_.end()) { + return values_.begin() + it->second; + } + + return values_.end(); +} + +inline object::const_iterator object::find(const std::string &s) const noexcept { + auto it = index_map_.find(s); + if (it != index_map_.end()) { + return values_.begin() + it->second; + } + + return values_.end(); +} + +/** + * @brief object::at + * @param key + * @return + */ +inline value object::at(const std::string &key) const { + + auto it = index_map_.find(key); + if (it != index_map_.end()) { + return values_[it->second].second; + } + + throw invalid_index(); +} + +/** + * @brief object::at + * @param key + * @return + */ +inline value &object::at(const std::string &key) { + + auto it = index_map_.find(key); + if (it != index_map_.end()) { + return values_[it->second].second; + } + + throw invalid_index(); +} + +/** + * @brief object::insert + * @param p + * @return + */ +template +auto object::insert(std::pair &&p) -> std::pair { + return insert(std::move(p.first), std::move(p.second)); +} + +/** + * @brief object::insert + * @param key + * @param v + * @return + */ +template +auto object::insert(std::string key, const T &v) -> std::pair { + + auto it = find(key); + if (it != values_.end()) { + return std::make_pair(it, false); + } + + auto n = values_.emplace(it, std::move(key), value(v)); + index_map_.emplace(n->first, values_.size() - 1); + return std::make_pair(n, true); +} + +/** + * @brief object::insert + * @param key + * @param v + * @return + */ +template +auto object::insert(std::string key, T &&v) -> std::pair { + + auto it = find(key); + if (it != values_.end()) { + return std::make_pair(it, false); + } + + auto n = values_.emplace(it, std::move(key), value(std::forward(v))); + index_map_.emplace(n->first, values_.size() - 1); + return std::make_pair(n, true); +} + +/** + * @brief array::array + * @param list + */ +inline array::array(std::initializer_list list) { + for (const auto &x : list) { + values_.emplace_back(x); + } +} + +/** + * 
@brief value::value + * @param o + */ +inline value::value(object_pointer o) + : storage_(std::move(o)), type_(type_object) { +} + +/** + * @brief value::value + * @param a + */ +inline value::value(array_pointer a) + : storage_(std::move(a)), type_(type_array) { +} + +/** + * @brief value::operator = + * @param rhs + * @return + */ +inline value &value::operator=(value &&rhs) { + if (this != &rhs) { + storage_ = std::move(rhs.storage_); + type_ = std::move(rhs.type_); + } + + return *this; +} + +/** + * @brief value::operator = + * @param rhs + * @return + */ +inline value &value::operator=(const value &rhs) { + + if (this != &rhs) { + storage_ = rhs.storage_; + type_ = rhs.type_; + } + + return *this; +} + +/** + * @brief value::at + * @param n + * @return + */ +inline value value::at(std::size_t n) const { + return as_array().at(n); +} + +/** + * @brief value::at + * @param n + * @return + */ +inline value &value::at(std::size_t n) { + return as_array().at(n); +} + +/** + * @brief value::at + * @param key + * @return + */ +inline value value::at(const std::string &key) const { + return as_object().at(key); +} + +/** + * @brief value::at + * @param key + * @return + */ +inline value &value::at(const std::string &key) { + return as_object().at(key); +} + +/** + * @brief value::operator [] + * @param key + * @return + */ +inline value value::operator[](const std::string &key) const { + return as_object()[key]; +} + +/** + * @brief value::operator [] + * @param n + * @return + */ +inline value value::operator[](std::size_t n) const { + return as_array()[n]; +} + +/** + * @brief value::operator [] + * @param key + * @return + */ +inline value &value::operator[](const std::string &key) { + return as_object()[key]; +} + +/** + * @brief value::operator [] + * @param n + * @return + */ +inline value &value::operator[](std::size_t n) { + return as_array()[n]; +} + +inline value value::operator[](const ptr &ptr) const { + + // this cast makes sure we don't get references 
to temps along the way + // but the final return will create a copy + value *result = const_cast(this); + for (const std::string &ref : ptr) { + + if (result->is_object()) { + result = &result->at(ref); + } else if (result->is_array()) { + + if (ref == "-") { + result->push_back(value()); + result = &result->at(result->size() - 1); + } else { + std::size_t n = std::stoul(ref); + result = &result->at(n); + } + } else { + throw invalid_path(); + } + } + + return *result; +} + +inline value &value::operator[](const ptr &ptr) { + + value *result = this; + for (const std::string &ref : ptr) { + + if (result->is_object()) { + result = &result->at(ref); + } else if (result->is_array()) { + if (ref == "-") { + result->push_back(value()); + result = &result->at(result->size() - 1); + } else { + std::size_t n = std::stoul(ref); + result = &result->at(n); + } + } else { + throw invalid_path(); + } + } + + return *result; +} + +inline value &value::create(const ptr &ptr) { + value *result = this; + for (const std::string &ref : ptr) { + + if (result->is_object()) { + // look the key up on the value itself rather than passing the pointer + if (!has_key(*result, ref)) { + result->insert(ref, object()); + } + result = &result->at(ref); + } else if (result->is_array()) { + if (ref == "-") { + result->push_back(value()); + result = &result->at(result->size() - 1); + } else { + std::size_t n = std::stoul(ref); + result = &result->at(n); + } + } else { + throw invalid_path(); + } + } + + return *result; +} + +/** + * @brief value::value + * @param a + */ +inline value::value(const array &a) + : type_(type_array) { + storage_ = std::make_shared(a); +} + +/** + * @brief value::value + * @param o + */ +inline value::value(const object &o) + : type_(type_object) { + storage_ = std::make_shared(o); +} + +/** + * @brief value::value + * @param a + */ +inline value::value(array &&a) + : type_(type_array) { + storage_ = std::make_shared(std::move(a)); +} + +/** + * @brief value::value + * @param o + */ +inline value::value(object &&o) + : type_(type_object) { + 
storage_ = std::make_shared(std::move(o)); +} + +/** + * @brief operator == + * @param lhs + * @param rhs + * @return + */ +inline bool operator==(const value &lhs, const value &rhs) { + if (lhs.type_ == rhs.type_) { + switch (lhs.type_) { + case value::type_string: + return as_string(lhs) == as_string(rhs); + case value::type_number: + return to_number(lhs) == to_number(rhs); + case value::type_null: + return true; + case value::type_boolean: + return to_bool(lhs) == to_bool(rhs); + case value::type_array: + return as_array(lhs) == as_array(rhs); + case value::type_object: + return as_object(lhs) == as_object(rhs); + case value::type_invalid: + break; + } + } + return false; +} + +/** + * @brief operator != + * @param lhs + * @param rhs + * @return + */ +inline bool operator!=(const value &lhs, const value &rhs) { + return !(lhs == rhs); +} + +/** + * @brief operator == + * @param lhs + * @param rhs + * @return + */ +inline bool operator==(const object &lhs, const object &rhs) noexcept { + if (lhs.values_.size() == rhs.values_.size()) { + return lhs.values_ == rhs.values_; + } + return false; +} + +/** + * @brief operator != + * @param lhs + * @param rhs + * @return + */ +inline bool operator!=(const object &lhs, const object &rhs) noexcept { + return !(lhs == rhs); +} + +/** + * @brief operator == + * @param lhs + * @param rhs + * @return + */ +inline bool operator==(const array &lhs, const array &rhs) noexcept { + if (lhs.values_.size() == rhs.values_.size()) { + return lhs.values_ == rhs.values_; + } + return false; +} + +/** + * @brief operator != + * @param lhs + * @param rhs + * @return + */ +inline bool operator!=(const array &lhs, const array &rhs) noexcept { + return !(lhs == rhs); +} + +/** + * @brief parser::get_string + * Reads one double-quoted string at the cursor and returns its decoded contents. + * Leading whitespace before the opening quote is skipped; everything after it is + * read with the *_no_consume accessors so no character inside the literal is dropped. + * Backslash escapes are translated; \uXXXX escapes (and surrogate pairs) are passed + * to detail::surrogate_pair_to_utf8. + * @return the decoded text without the surrounding quotes + * @throws string_expected, quote_expected, invalid_unicode_character, utf16_surrogate_expected + */ +template +std::string parser::get_string() { + + if (read() != Quote) { + throw string_expected(); + } + + std::string s; + + std::back_insert_iterator out = back_inserter(s); + + // Stop at end of input as well: there peek_no_consume() yields '\0' and + // read_no_consume() does not advance, so an unterminated string would + // otherwise append '\0' forever. + while (!at_end() && peek_no_consume() != Quote && 
peek_no_consume() != '\n') { + + char ch = read_no_consume(); + if (ch == '\\') { + switch (read_no_consume()) { + case '"': + *out++ = '"'; + break; + case '\\': + *out++ = '\\'; + break; + case '/': + *out++ = '/'; + break; + case 'b': + *out++ = '\b'; + break; + case 'f': + *out++ = '\f'; + break; + case 'n': + *out++ = '\n'; + break; + case 'r': + *out++ = '\r'; + break; + case 't': + *out++ = '\t'; + break; + case 'u': { + // convert \uXXXX escape sequences to UTF-8 + char hex[4]; + + // The four hex digits must follow immediately; read() would skip + // whitespace inside the escape. The unsigned char cast keeps + // std::isxdigit defined for bytes >= 0x80. + if (!std::isxdigit(static_cast<unsigned char>(hex[0] = read_no_consume()))) throw invalid_unicode_character(); + if (!std::isxdigit(static_cast<unsigned char>(hex[1] = read_no_consume()))) throw invalid_unicode_character(); + if (!std::isxdigit(static_cast<unsigned char>(hex[2] = read_no_consume()))) throw invalid_unicode_character(); + if (!std::isxdigit(static_cast<unsigned char>(hex[3] = read_no_consume()))) throw invalid_unicode_character(); + + uint16_t w1 = 0; + uint16_t w2 = 0; + + w1 |= (detail::to_hex(hex[0]) << 12); + w1 |= (detail::to_hex(hex[1]) << 8); + w1 |= (detail::to_hex(hex[2]) << 4); + w1 |= (detail::to_hex(hex[3])); + + // a low surrogate may not appear on its own + if ((w1 & 0xfc00) == 0xdc00) { + throw invalid_unicode_character(); + } + + if ((w1 & 0xfc00) == 0xd800) { + // part of a surrogate pair: the second \uXXXX must be adjacent + if (read_no_consume() != '\\') { + throw utf16_surrogate_expected(); + } + + if (read_no_consume() != 'u') { + throw utf16_surrogate_expected(); + } + + // convert \uXXXX escape sequences for surrogate pairs to UTF-8 + if (!std::isxdigit(static_cast<unsigned char>(hex[0] = read_no_consume()))) throw invalid_unicode_character(); + if (!std::isxdigit(static_cast<unsigned char>(hex[1] = read_no_consume()))) throw invalid_unicode_character(); + if (!std::isxdigit(static_cast<unsigned char>(hex[2] = read_no_consume()))) throw invalid_unicode_character(); + if (!std::isxdigit(static_cast<unsigned char>(hex[3] = read_no_consume()))) throw invalid_unicode_character(); + + w2 |= (detail::to_hex(hex[0]) << 12); + w2 |= (detail::to_hex(hex[1]) << 8); + w2 |= (detail::to_hex(hex[2]) << 4); + w2 |= (detail::to_hex(hex[3])); + } + + detail::surrogate_pair_to_utf8(w1, w2, out); + break; + } + default: + *out++ = '\\'; + break; + } + } else { + *out++ = ch; + } + } + + // The closing quote has to be the very next character; read() would skip a + // raw newline plus indentation and accept a quote found after it. + if (read_no_consume() != Quote) { + throw quote_expected(); + } + + return s; +} + +/** + * 
@brief parser::get_number + * Reads one JSON number token at the cursor and returns its text unchanged. + * Whitespace is skipped once, by the first peek(); every later character is taken + * with the *_no_consume accessors so whitespace can never appear inside the token + * (previously "[1 2]" was read as the single number 12). + * @return the literal characters of the number (sign, integer part, fraction, exponent) + * @throws invalid_number when the text does not match the JSON number grammar + */ +template +std::string parser::get_number() { + std::string s; + s.reserve(10); + std::back_insert_iterator out = back_inserter(s); + + // std::isdigit is only defined for unsigned char values (and EOF) + const auto is_digit = [](char c) { return std::isdigit(static_cast<unsigned char>(c)) != 0; }; + + // JSON numbers fit the regex: -?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)? + + // -? (the only peek() that may skip leading whitespace) + if (peek() == '-') { + *out++ = read_no_consume(); + } + + // (0|[1-9][0-9]*) + char first_digit = peek_no_consume(); + if (first_digit >= '1' && first_digit <= '9') { + do { + *out++ = read_no_consume(); + } while (is_digit(peek_no_consume())); + } else if (first_digit == '0') { + *out++ = read_no_consume(); + } else { + throw invalid_number(); + } + + // (\.[0-9]+)? + if (peek_no_consume() == '.') { + *out++ = read_no_consume(); + if (!is_digit(peek_no_consume())) { + throw invalid_number(); + } + + while (is_digit(peek_no_consume())) { + *out++ = read_no_consume(); + } + } + + // ([eE][+-]?[0-9]+)? + if (peek_no_consume() == 'e' || peek_no_consume() == 'E') { + *out++ = read_no_consume(); + if (peek_no_consume() == '+' || peek_no_consume() == '-') { + *out++ = read_no_consume(); + } + + if (!is_digit(peek_no_consume())) { + throw invalid_number(); + } + + while (is_digit(peek_no_consume())) { + *out++ = read_no_consume(); + } + } + + return s; +} + +template +void value::push_back(T &&v) { + as_array().push_back(std::forward(v)); +} + +template +void value::push_back(const T &v) { + as_array().push_back(v); +} + +template +std::pair value::insert(std::string key, const T &v) { + return as_object().insert(std::move(key), v); +} + +template +std::pair value::insert(std::string key, T &&v) { + return as_object().insert(std::move(key), std::forward(v)); +} + +template +std::pair value::insert(std::pair &&p) { + return as_object().insert(std::forward(p)); +} + +} + +#endif diff --git a/lib/json_parser/test/CMakeLists.txt b/lib/json_parser/test/CMakeLists.txt new file mode 100644 index 0000000..669b493 --- /dev/null +++ b/lib/json_parser/test/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required (VERSION 3.0) + +add_executable(example1 example1.cpp) +add_executable(example2 example2.cpp) +add_executable(example3 example3.cpp) + +target_link_libraries(example1 PUBLIC cpp-json) 
+target_link_libraries(example2 PUBLIC cpp-json) +target_link_libraries(example3 PUBLIC cpp-json) + +set_property(TARGET example1 PROPERTY CXX_STANDARD 11) +set_property(TARGET example2 PROPERTY CXX_STANDARD 11) +set_property(TARGET example3 PROPERTY CXX_STANDARD 11) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic -W -Wall -Wmissing-field-initializers -Wunused -Wshadow") diff --git a/lib/json_parser/test/example1.cpp b/lib/json_parser/test/example1.cpp new file mode 100644 index 0000000..52a7795 --- /dev/null +++ b/lib/json_parser/test/example1.cpp @@ -0,0 +1,18 @@ + +#include "cpp-json/json.h" +#include +#include + +/** + * @brief main + * @return + */ +int main() { + + // construct from a file + std::ifstream file("example1.json"); + if(file) { + auto v1 = json::parse(file); + std::cout << stringify(v1, json::PrettyPrint | json::EscapeUnicode) << '\n'; + } +} diff --git a/lib/json_parser/test/example1.json b/lib/json_parser/test/example1.json new file mode 100644 index 0000000..6dcdba0 --- /dev/null +++ b/lib/json_parser/test/example1.json @@ -0,0 +1,20 @@ +{ + "test1":1, + "object1":{ + "object2":{ + "test2":2, + "object3":{ + "test3":3, + "test4":[1,2,3,4] + } + } + }, + "test\nreturn":"hello\tworld", + "u":"\u1234", + "b1":true, + "b2":false, + "n1":null, + "n2":-1234 + +} + diff --git a/lib/json_parser/test/example2.cpp b/lib/json_parser/test/example2.cpp new file mode 100644 index 0000000..a7d6655 --- /dev/null +++ b/lib/json_parser/test/example2.cpp @@ -0,0 +1,25 @@ + +#include "cpp-json/json.h" +#include + +/** + * @brief main + * @return + */ +int main() { + + // construct programmatically using object literal syntax in C++11 + auto arr = json::array { + 1, + 2, + 3, + 4, + "Testing 1 2 3", + json::object{ + { "hello", 1234 }, + { "world", 5678 } + } + }; + + std::cout << stringify(arr) << '\n'; +} diff --git a/lib/json_parser/test/example3.cpp b/lib/json_parser/test/example3.cpp new file mode 100644 index 0000000..80300f1 --- /dev/null +++ 
b/lib/json_parser/test/example3.cpp @@ -0,0 +1,39 @@ + +#include "cpp-json/json.h" +#include + +/** + * @brief main + * @return + */ +int main() { + + // construct from string (C++11 raw string literals work nicely here!) + auto v = json::parse(R"( + { + "test3" : { + "x" : 123.456 + }, + "test4" : [ + 1, + 2, + 3, + { + "z" : 12345.6 + } + ], + "test1" : "hello world", + "test2" : "BLAH\uD840\uDC8ABLAH" + } + )"); + + std::cout << stringify(v, json::PrettyPrint) << '\n'; + std::cout << "----------" << std::endl; + + // get a specific value + json::value z = v["test4"][3]["z"]; + std::cout << json::to_number(z) << std::endl; + std::cout << json::to_number(z) << std::endl; + + std::cout << stringify(v, json::EscapeUnicode) << '\n'; +} diff --git a/lib/json_parser/test/example_output/example1.txt b/lib/json_parser/test/example_output/example1.txt new file mode 100644 index 0000000..3e42813 --- /dev/null +++ b/lib/json_parser/test/example_output/example1.txt @@ -0,0 +1,23 @@ +{ + "n2" : -1234, + "b2" : false, + "n1" : null, + "b1" : true, + "u" : "\u1234", + "test\nreturn" : "hello\tworld", + "object1" : { + "object2" : { + "object3" : { + "test4" : [ + 1, + 2, + 3, + 4 + ], + "test3" : 3 + }, + "test2" : 2 + } + }, + "test1" : 1 +} diff --git a/lib/json_parser/test/example_output/example2.txt b/lib/json_parser/test/example_output/example2.txt new file mode 100644 index 0000000..cc24e95 --- /dev/null +++ b/lib/json_parser/test/example_output/example2.txt @@ -0,0 +1 @@ +[1,2,3,4,"Testing 1 2 3",{"world":5678,"hello":1234}] diff --git a/lib/json_parser/test/example_output/example3.txt b/lib/json_parser/test/example_output/example3.txt new file mode 100644 index 0000000..a6558ef --- /dev/null +++ b/lib/json_parser/test/example_output/example3.txt @@ -0,0 +1,19 @@ +{ + "test2" : "BLAH𠂊BLAH", + "test1" : "hello world", + "test4" : [ + 1, + 2, + 3, + { + "z" : 12345.6 + } + ], + "test3" : { + "x" : 123.456 + } +} +---------- +12345.6 +12345 
+{"test2":"BLAH\uD840\uDC8ABLAH","test1":"hello world","test4":[1,2,3,{"z":12345}],"test3":{"x":123.456}} diff --git a/lib/kProcessor b/lib/kProcessor index 3356d7d..a4ebc75 160000 --- a/lib/kProcessor +++ b/lib/kProcessor @@ -1 +1 @@ -Subproject commit 3356d7dad5a8c55cebe49a63113c41ef536d1b9c +Subproject commit a4ebc75e27555812556ccd33b7ca11b46a4855e9 diff --git a/pykSpider/kSpider2/kSpider_main.py b/pykSpider/kSpider2/kSpider_main.py index fcb141d..508d98d 100644 --- a/pykSpider/kSpider2/kSpider_main.py +++ b/pykSpider/kSpider2/kSpider_main.py @@ -2,18 +2,18 @@ from kSpider2.click_context import cli from kSpider2.ks_pairwise import main as pairwise_main # pylint: disable=relative-beyond-top-level -from kSpider2.ks_index import kmers, skipmers, protein # pylint: disable=relative-beyond-top-level +# from kSpider2.ks_index import kmers, skipmers, protein # pylint: disable=relative-beyond-top-level from kSpider2.ks_clustering import main as clustering from kSpider2.ks_fastx_to_kfs import main as fastx_to_kfs from kSpider2.ks_export import main as export from kSpider2.ks_dataset_indexing import main as index_datasets -cli.add_command(kmers, name="index_kmers") -cli.add_command(skipmers, name="index_skipmers") -cli.add_command(protein, name="index_protein") +# cli.add_command(kmers, name="index_kmers") +# cli.add_command(skipmers, name="index_skipmers") +# cli.add_command(protein, name="index_protein") cli.add_command(fastx_to_kfs, name="sketch") -cli.add_command(index_datasets, name="index_datasets") +cli.add_command(index_datasets, name="index") cli.add_command(pairwise_main, name="pairwise") cli.add_command(clustering, name="cluster") cli.add_command(export, name="export") diff --git a/pykSpider/kSpider2/ks_clustering.py b/pykSpider/kSpider2/ks_clustering.py index c20a849..6a8f11b 100644 --- a/pykSpider/kSpider2/ks_clustering.py +++ b/pykSpider/kSpider2/ks_clustering.py @@ -208,7 +208,7 @@ def export_kCluster(self): """ -@cli.command(name="cluster", help_priority=7) 
+@cli.command(name="cluster", help_priority=4) @click.option('-c', '--cutoff', required=False, type=click.FloatRange(0, 1, clamp=False), default=0.0, show_default=True, help="cluster sequences with (containment > cutoff)") @click.option('-i', '--index-prefix', "index_prefix", required=True, type=click.STRING, help="Index file prefix") @click.pass_context diff --git a/pykSpider/kSpider2/ks_dataset_indexing.py b/pykSpider/kSpider2/ks_dataset_indexing.py index e665a7d..348146c 100644 --- a/pykSpider/kSpider2/ks_dataset_indexing.py +++ b/pykSpider/kSpider2/ks_dataset_indexing.py @@ -9,7 +9,7 @@ from glob import glob -@cli.command(name="index_datasets", help_priority=5) +@cli.command(name="index", help_priority=2) @click.option('--dir', "sketches_dir", required = True, help="Sketches directory (must contain only the sketches)") @click.option('-k', '--kmer-size', "kSize", required=False, default = 0, type=click.INT, help="kmer size (only if using --sourmash)") @click.option('--sourmash', "sourmash", is_flag=True, show_default=True, default=False, help="use sourmash sigs instead of kProcessor") diff --git a/pykSpider/kSpider2/ks_export.py b/pykSpider/kSpider2/ks_export.py index 327d892..f753eec 100644 --- a/pykSpider/kSpider2/ks_export.py +++ b/pykSpider/kSpider2/ks_export.py @@ -38,7 +38,7 @@ def get_newick(node, parent_dist, leaf_names, newick='') -> str: return newick -@cli.command(name="export", help_priority=8) +@cli.command(name="export", help_priority=5) @click.option('-i', '--index-prefix', required=True, type=click.STRING, help="Index file prefix") @click.option('--dist-mat', "distance_matrix", is_flag=True, help="Convert pairwise matrix to NxN distance matrix", default=False) @click.option('--newick', "newick", is_flag=True, help="Convert pairwise (containment) matrix to newick format", default=False) diff --git a/pykSpider/kSpider2/ks_fastx_to_kfs.py b/pykSpider/kSpider2/ks_fastx_to_kfs.py index 1287ba4..54c53eb 100644 --- 
a/pykSpider/kSpider2/ks_fastx_to_kfs.py +++ b/pykSpider/kSpider2/ks_fastx_to_kfs.py @@ -9,7 +9,7 @@ import os -@cli.command(name="sketch", help_priority=4) +@cli.command(name="sketch", help_priority=1) @click.option('-c', '--chunk-size', "chunk_size", required=False, type=click.INT, default=3000, help="chunk size") @click.option('-k', '--kmer-size', "kSize", required=True, type=click.IntRange(7, 31, clamp=False), help="kmer size") @click.option('--fastx', "fastx", type=click.Path(exists=True), help = "FASTX file path, works with interleaved paired-end and protein", required= False) @@ -22,7 +22,7 @@ @click.pass_context def main(ctx, fastx, r1, r2, chunk_size, kSize, protein, dayhoff, downsampling_ratio, singletones): """ - Sketch sequence files. + Sketch a FASTA/Q file. You can use sourmash if required scale >100. """ if protein and (r1 or r2): diff --git a/pykSpider/kSpider2/ks_pairwise.py b/pykSpider/kSpider2/ks_pairwise.py index c323fe6..97d6bfb 100644 --- a/pykSpider/kSpider2/ks_pairwise.py +++ b/pykSpider/kSpider2/ks_pairwise.py @@ -1,28 +1,23 @@ #!/usr/bin/python # -*- coding: utf-8 -*- from __future__ import division - import sys - -from traitlets import default - import _kSpider_internal as kSpider_internal - import click from kSpider2.click_context import cli -@cli.command(name="pairwise", help_priority=6) +@cli.command(name="pairwise", help_priority=3) @click.option('-i', '--index-prefix', required=True, type=click.STRING, help="Index file prefix") -@click.option('-t', '--threads', "user_threads", default = 1, required=False, type=int, help="number of cores") - +@click.option('-t', '--threads', "user_threads", default=1, required=False, type=int, help="number of cores") @click.pass_context def main(ctx, index_prefix, user_threads): """ Generate containment pairwise matrix. 
""" - ctx.obj.INFO(f"Constructing the containment pairwise matrix using {user_threads} cores.") + ctx.obj.INFO( + f"Constructing the containment pairwise matrix using {user_threads} cores.") kSpider_internal.pairwise(index_prefix, user_threads) diff --git a/sig_to_bin.cpp b/sig_to_bin.cpp new file mode 100644 index 0000000..efee58f --- /dev/null +++ b/sig_to_bin.cpp @@ -0,0 +1,67 @@ +#include +#include +#include +#include "parallel_hashmap/phmap.h" +#include +#include +#include +#include +#include +#include "parallel_hashmap/phmap_dump.h" +#include +#include "cpp-json/json.h" +#include "zstr.hpp" + +using namespace std; +// using namespace phmap; + +typedef std::chrono::high_resolution_clock Time; + + +int main(int argc, char** argv) { + + if (argc != 5) { + cout << "run: ./sig_to_bin " << endl; + exit(1); + } + + string sig_path = argv[1]; + int kSize = stoi(argv[2]); + int min_abundance = stoi(argv[3]); + string output_path = argv[4]; + + auto begin_time = Time::now(); + + phmap::flat_hash_set tmp_hashes; + + zstr::ifstream sig_stream(sig_path); + json::value json = json::parse(sig_stream); + auto sourmash_sig = json[0]["signatures"]; + const json::array& sig_array = as_array(sourmash_sig); + for (auto it = sig_array.begin(); it != sig_array.end(); ++it) { + const json::value& v = *it; + if (v["ksize"] == kSize) { + const json::array& mins = as_array(v["mins"]); + const json::array& abundances = as_array(v["abundances"]); + auto mins_it = mins.begin(); + auto abund_it = abundances.begin(); + while (mins_it != mins.end()) { + if (json::to_number(*abund_it) >= min_abundance) + tmp_hashes.insert(json::to_number(*mins_it)); + + mins_it++; + abund_it++; + } + } + break; + } + + + cout << "inserted " << tmp_hashes.size() << " hashes." 
<< endl; + string out_path = output_path; + phmap::BinaryOutputArchive ar_out(out_path.c_str()); + tmp_hashes.phmap_dump(ar_out); + cout << "Conversion done in " << std::chrono::duration(Time::now() - begin_time).count() / 1000 << " secs" << endl; + + +} \ No newline at end of file diff --git a/sigs_to_bins.cpp b/sigs_to_bins.cpp new file mode 100644 index 0000000..b297086 --- /dev/null +++ b/sigs_to_bins.cpp @@ -0,0 +1,143 @@ +#include +#include +#include +#include "parallel_hashmap/phmap.h" +#include +#include +#include "cpp-json/json.h" +#include "zstr.hpp" +#include +#include +#include +#include "parallel_hashmap/phmap_dump.h" +#include +#include +#include + +using namespace std; + +typedef std::chrono::high_resolution_clock Time; + +inline bool file_exists(const std::string& name) { + struct stat buffer; + return (stat(name.c_str(), &buffer) == 0); +} + + +std::vector glob2(const std::string& pattern) { + using namespace std; + + // glob struct resides on the stack + glob_t glob_result; + memset(&glob_result, 0, sizeof(glob_result)); + + // do the glob operation + int return_value = glob(pattern.c_str(), GLOB_TILDE, NULL, &glob_result); + if (return_value != 0) { + globfree(&glob_result); + stringstream ss; + ss << "glob() failed with return_value " << return_value << endl; + throw std::runtime_error(ss.str()); + } + + vector filenames; + for (size_t i = 0; i < glob_result.gl_pathc; ++i) + filenames.push_back(string(glob_result.gl_pathv[i])); + + // cleanup + globfree(&glob_result); + + // done + return filenames; +} + + +int main(int argc, char** argv) { + + if (argc != 5) { + cout << "run: ./sigs_to_bins " << endl; + exit(1); + } + string sigs_dir = argv[1]; + int kSize = stoi(argv[2]); + string output_dir = argv[3]; + int user_threads = stoi(argv[4]); + + string cmd = "mkdir -p " + output_dir; + + const int dir_err = system(cmd.c_str()); + if (-1 == dir_err) + { + printf("Error creating directory!n"); + exit(1); + } + + + // 1. 
Scan all sigs in a directory + vector sigs_paths; + vector sig_names; + + int skipped_files = 0; + + int total_sigs_number = 0; + for (const auto& dirEntry : glob2(sigs_dir + "/*")) { + string file_name = (string)dirEntry; + size_t lastindex = file_name.find_last_of("."); + string sig_prefix = file_name.substr(0, lastindex); + std::string sig_basename = sig_prefix.substr(sig_prefix.find_last_of("/\\") + 1); + std::string::size_type idx; + idx = file_name.rfind('.'); + std::string extension = ""; + if (idx != std::string::npos) extension = file_name.substr(idx + 1); + if (extension != "sig" && extension != "gz") continue; + + if (file_exists(output_dir + "/" + sig_basename + ".bin")) { skipped_files++; continue; } + + sig_names.push_back(sig_basename); + sigs_paths.push_back(file_name); + + total_sigs_number++; + } + + cout << "Skipped " << skipped_files << " files as they already converted to bins." << endl; + + int sigs_count = sigs_paths.size(); + auto begin_time = Time::now(); + +#pragma omp parallel num_threads(user_threads) + { +#pragma omp for + for (int j = 0; j < sigs_paths.size(); j++) { + string& sig_path = sigs_paths[j]; + string& sig_name = sig_names[j]; + phmap::flat_hash_set tmp_hashes; + + zstr::ifstream sig_stream(sig_path); + json::value json = json::parse(sig_stream); + auto sourmash_sig = json[0]["signatures"]; + const json::array& sig_array = as_array(sourmash_sig); + for (auto it = sig_array.begin(); it != sig_array.end(); ++it) { + const json::value& v = *it; + if (v["ksize"] == kSize) { + const json::array& mins = as_array(v["mins"]); + const json::array& abundances = as_array(v["abundances"]); + auto mins_it = mins.begin(); + while (mins_it != mins.end()) { + // const auto & abund = json::to_number(*abund_it); + tmp_hashes.insert(json::to_number(*mins_it)); + mins_it++; + } + } + break; + } + + string out_path = output_dir + "/" + sig_name + ".bin"; + phmap::BinaryOutputArchive ar_out(out_path.c_str()); + tmp_hashes.phmap_dump(ar_out); + } + 
} + + cout << endl; + cout << "Process completed in " << std::chrono::duration(Time::now() - begin_time).count() / 1000 << " secs" << endl; + +} \ No newline at end of file diff --git a/src/bins_indexing.cpp b/src/bins_indexing.cpp new file mode 100644 index 0000000..4c6473a --- /dev/null +++ b/src/bins_indexing.cpp @@ -0,0 +1,292 @@ +#include "kSpider.hpp" +#include +#include +#include +#include "colored_kDataFrame.hpp" +#include "parallel_hashmap/phmap.h" +#include "kDataFrame.hpp" +#include "algorithms.hpp" +#include +#include +#include +#include +#include +#include +#include "parallel_hashmap/phmap_dump.h" +#include + +typedef std::chrono::high_resolution_clock Time; + + +using BINS_MAP = phmap::parallel_flat_hash_map, + phmap::priv::hash_default_hash, + phmap::priv::hash_default_eq, + std::allocator>>, + 12, + std::mutex +>; +using LEGENDS_MAP = phmap::parallel_flat_hash_map, + std::hash, + std::equal_to, + std::allocator>>, + 4>; // 6 submaps because colors will grow + +using LEGENDS_MAP_OLD = phmap::parallel_flat_hash_map>; + + +// thanks to https://stackoverflow.com/a/8615450/3371177 +inline std::vector glob2(const std::string& pattern) { + using namespace std; + + // glob struct resides on the stack + glob_t glob_result; + memset(&glob_result, 0, sizeof(glob_result)); + + // do the glob operation + int return_value = glob(pattern.c_str(), GLOB_TILDE, NULL, &glob_result); + if (return_value != 0) { + globfree(&glob_result); + stringstream ss; + ss << "glob() failed with return_value " << return_value << endl; + throw std::runtime_error(ss.str()); + } + + // collect all the filenames into a std::list + vector filenames; + for (size_t i = 0; i < glob_result.gl_pathc; ++i) { + filenames.push_back(string(glob_result.gl_pathv[i])); + } + + // cleanup + globfree(&glob_result); + + // done + return filenames; +} + +namespace kSpider { + + void bins_indexing(string bins_dir, int selective_kSize, string output_prefix, uint64_t kmers_reserve, uint64_t colors_reserve) 
{ + + kDataFrame* frame; + std::string dir_prefix = bins_dir.substr(bins_dir.find_last_of("/\\") + 1); + + flat_hash_map namesMap; + string names_fileName = bins_dir; + + flat_hash_map tagsMap; + flat_hash_map groupNameMap; + + auto* legend = new LEGENDS_MAP(); + legend->reserve(colors_reserve); + + flat_hash_map colorsCount; + uint64_t readID = 0, groupID = 1; + string seqName, groupName; + string line; + priority_queue, std::greater> freeColors; + flat_hash_map groupCounter; + int detected_kSize = 0; + + int total_bins_number = 0; + frame = new kDataFramePHMAP(selective_kSize, mumur_hasher); + frame->reserve(kmers_reserve); + + flat_hash_map basename_to_path; + + for (const auto& dirEntry : glob2(bins_dir + "/*")) { + string file_name = (string)dirEntry; + size_t lastindex = file_name.find_last_of("."); + string bin_prefix = file_name.substr(0, lastindex); + std::string bin_basename = bin_prefix.substr(bin_prefix.find_last_of("/\\") + 1); + + + std::string::size_type idx; + idx = file_name.rfind('.'); + std::string extension = ""; + if (idx != std::string::npos) extension = file_name.substr(idx + 1); + if (extension != "bin") { + cerr << "skipping " << file_name << " does not have extension .bin" << endl; + continue; + } + + basename_to_path.insert(pair(bin_basename, file_name)); + + total_bins_number++; + + seqName = bin_basename; + groupName = bin_basename; + + namesMap.insert(make_pair(seqName, groupName)); + auto it = groupNameMap.find(groupName); + groupCounter[groupName]++; + if (it == groupNameMap.end()) { + groupNameMap.insert(make_pair(groupName, groupID)); + tagsMap.insert(make_pair(to_string(groupID), groupID)); + vector tmp; + tmp.clear(); + tmp.push_back(groupID); + legend->insert(make_pair(groupID, tmp)); + colorsCount.insert(make_pair(groupID, 0)); + groupID++; + } + } + + cout << "namesmap construction done..." 
<< endl; + + + // ---------------------------------------------------------------- + + + flat_hash_map inv_groupNameMap; + for (auto& _ : groupNameMap) + inv_groupNameMap[_.second] = _.first; + + + int currIndex = 0; + string kmer; + uint64_t tagBits = 0; + uint64_t maxTagValue = (1ULL << tagBits) - 1; + + uint64_t lastTag = 0; + readID = 0; + + int processed_bins_count = 0; + auto begin_time = Time::now(); + uint_fast64_t current_kmers_numbers = 0; + + // START + for (const auto& [bin_basename, bin_path] : basename_to_path) { + //START + + cout << "Processing " << ++processed_bins_count << "/" << total_bins_number << " | " << bin_basename << " ... " << endl; + + flat_hash_map convertMap; + + string readName = bin_basename; + string groupName = bin_basename; + + uint64_t readTag = groupNameMap.find(groupName)->second; + + + convertMap.clear(); + convertMap.insert(make_pair(0, readTag)); + convertMap.insert(make_pair(readTag, readTag)); + + begin_time = Time::now(); + phmap::flat_hash_set bin_hashes; + phmap::BinaryInputArchive ar_in(bin_path.c_str()); + bin_hashes.phmap_load(ar_in); + + for (const uint64_t& hashed_kmer : bin_hashes) { + uint64_t currentTag = frame->getCount(hashed_kmer); + auto itc = convertMap.find(currentTag); + if (itc == convertMap.end()) { + vector colors = legend->find(currentTag)->second; + auto tmpiT = find(colors.begin(), colors.end(), readTag); + if (tmpiT == colors.end()) { + colors.push_back(readTag); + sort(colors.begin(), colors.end()); + } + + string colorsString = to_string(colors[0]); + for (int k = 1; k < colors.size(); k++) { + colorsString += ";" + to_string(colors[k]); + } + + auto itTag = tagsMap.find(colorsString); + if (itTag == tagsMap.end()) { + uint64_t newColor; + if (freeColors.size() == 0) { + newColor = groupID++; + } + else { + newColor = freeColors.top(); + freeColors.pop(); + } + + tagsMap.insert(make_pair(colorsString, newColor)); + legend->insert(make_pair(newColor, colors)); + itTag = tagsMap.find(colorsString); 
+ colorsCount[newColor] = 0; + } + uint64_t newColor = itTag->second; + + convertMap.insert(make_pair(currentTag, newColor)); + itc = convertMap.find(currentTag); + } + + if (itc->second != currentTag) { + + colorsCount[currentTag]--; + if (colorsCount[currentTag] == 0 && currentTag != 0) { + + auto _invGroupNameIT = inv_groupNameMap.find(currentTag); + if (_invGroupNameIT == inv_groupNameMap.end()) { + freeColors.push(currentTag); + vector colors = legend->find(currentTag)->second; + string colorsString = to_string(colors[0]); + for (int k = 1; k < colors.size(); k++) { + colorsString += ";" + to_string(colors[k]); + } + tagsMap.erase(colorsString); + legend->erase(currentTag); + if (convertMap.find(currentTag) != convertMap.end()) + convertMap.erase(currentTag); + } + + } + colorsCount[itc->second]++; + } + + frame->setCount(hashed_kmer, itc->second); + // no need now + // if (frame->getCount(hashed_kmer) != itc->second) { + // //frame->setC(kmer,itc->second); + // cout << "Error Founded " << hashed_kmer << " from sequence " << readName << " expected " + // << itc->second << " found " << frame->getCount(hashed_kmer) << endl; + // exit(1); + // } + } + readID += 1; + groupCounter[groupName]--; + if (colorsCount[readTag] == 0) { + if (groupCounter[groupName] == 0) { + freeColors.push(readTag); + legend->erase(readTag); + } + + } + auto loop_time_secs = std::chrono::duration(Time::now() - begin_time).count() / 1000; + cout << " loaded_kmers " << bin_hashes.size() << endl; + cout << " uniq_added_kmers: " << frame->size() - current_kmers_numbers << endl; + cout << " total_kmers " << frame->size() << " | load_factor: " << frame->load_factor() << endl; + cout << " total_colors " << legend->size() << " | load_factor: " << legend->load_factor() << endl; + cout << " loop_time: " << loop_time_secs << " secs" << endl; + cout << "--------" << endl; + current_kmers_numbers = frame->size(); + + // END + + } + + + colorTable* colors = new intVectorsTable(); + for (auto it : 
*legend) { + colors->setColor(it.first, it.second); + } + + colored_kDataFrame* res = new colored_kDataFrame(); + res->setColorTable(colors); + res->setkDataFrame(frame); + for (auto iit = namesMap.begin(); iit != namesMap.end(); iit++) { + uint32_t sampleID = groupNameMap[iit->second]; + res->namesMap[sampleID] = iit->second; + res->namesMapInv[iit->second] = sampleID; + } + cout << "saving to " << dir_prefix << " ..." << endl; + res->save(output_prefix); + } + +} \ No newline at end of file diff --git a/src/pairwise.cpp b/src/pairwise.cpp index 1a76ae5..47ea7e8 100644 --- a/src/pairwise.cpp +++ b/src/pairwise.cpp @@ -71,7 +71,7 @@ namespace kSpider { void pairwise(string index_prefix, int user_threads) { // Read colors - clock_t begin_time = clock(); + auto begin_time = Time::now(); string colors_map = index_prefix + "colors.intvectors"; ifstream input(colors_map.c_str()); int size; @@ -88,9 +88,9 @@ namespace kSpider { } } - cout << "mapping colors to groups: " << float(clock() - begin_time) / CLOCKS_PER_SEC << " secs" << endl; + cout << "mapping colors to groups: " << std::chrono::duration(Time::now() - begin_time).count() / 1000 << " secs" << endl; - begin_time = clock(); + begin_time = Time::now(); int_int_map colorsCount; auto* kf = kDataFrame::load(index_prefix); auto it = kf->begin(); @@ -117,8 +117,8 @@ namespace kSpider { // colorsCount.insert(make_pair(tmp[0], tmp[1])); // } - cout << "parsing index colors: " << float(clock() - begin_time) / CLOCKS_PER_SEC << " secs" << endl; - begin_time = clock(); + cout << "parsing index colors: " << std::chrono::duration(Time::now() - begin_time).count() / 1000 << " secs" << endl; + begin_time = Time::now(); flat_hash_map groupID_to_kmerCount; for (const auto& record : color_to_ids) { uint32_t colorCount = colorsCount[record.first]; @@ -135,9 +135,9 @@ namespace kSpider { fstream_kmerCount << ++counter << '\t' << item.first << '\t' << item.second << '\n'; } fstream_kmerCount.close(); - cout << "kmer counting: " << 
float(clock() - begin_time) / CLOCKS_PER_SEC << " secs" << endl; //time + cout << "kmer counting: " << std::chrono::duration(Time::now() - begin_time).count() / 1000 << " secs" << endl; - begin_time = clock(); + begin_time = Time::now(); clock_t begin_detailed_pairwise_comb, begin_detailed_pairwise_edges, begin_detailed_pairwise_edges_insertion; float detailed_pairwise_comb = 0.0; float detailed_pairwise_edges = 0.0; @@ -152,7 +152,7 @@ namespace kSpider { int n = vec_color_to_ids.size(); omp_set_num_threads(user_threads); - begin_time = clock(); + begin_time = Time::now(); #pragma omp parallel private(vec_i,thread_num,num_threads,start,end) { @@ -183,15 +183,17 @@ namespace kSpider { } } - cout << "pairwise hashmap construction: " << float(clock() - begin_time) / CLOCKS_PER_SEC << " secs" << endl; //time - cout << "writing pairwise matrix to" << index_prefix << "_kSpider_pairwise.tsv" << endl; + cout << "pairwise hashmap construction: " << std::chrono::duration(Time::now() - begin_time).count() / 1000 << " secs" << endl; + cout << "writing pairwise matrix to " << index_prefix << "_kSpider_pairwise.tsv" << endl; std::ofstream myfile; myfile.open(index_prefix + "_kSpider_pairwise.tsv"); - myfile << "ID" << '\t' << "seq1" << '\t' << "seq2" << '\t' << "shared_kmers" << '\n'; + myfile << "bin_1" << '\t' << "bin_2" << '\t' << "shared_kmers" << '\t' << "max_containment" << '\n'; uint64_t line_count = 0; - for (const auto& edge : edges) - myfile << ++line_count << '\t' << edge.first.first << '\t' << edge.first.second << '\t' << edge.second << '\n'; + for (const auto& edge : edges) { + float max_containment = (float)edge.second / min(groupID_to_kmerCount[edge.first.first], groupID_to_kmerCount[edge.first.second]); + myfile << edge.first.first << '\t' << edge.first.second << '\t' << edge.second << '\t' << max_containment << '\n'; + } myfile.close(); } diff --git a/src/sourmash_indexing.cpp b/src/sourmash_indexing.cpp index ba15676..89e7531 100644 --- 
a/src/sourmash_indexing.cpp +++ b/src/sourmash_indexing.cpp @@ -11,46 +11,9 @@ #include #include #include -#include "RSJparser.tcc" #include - -using JSON = RSJresource; - -// thanks to http://jsteemann.github.io/blog/2016/06/02/fastest-string-to-uint64-conversion-method/ -inline uint64_t unrolled(std::string const& value) { - uint64_t result = 0; - - size_t const length = value.size(); - switch (length) { - case 20: result += (value[length - 20] - '0') * 10000000000000000000ULL; - case 19: result += (value[length - 19] - '0') * 1000000000000000000ULL; - case 18: result += (value[length - 18] - '0') * 100000000000000000ULL; - case 17: result += (value[length - 17] - '0') * 10000000000000000ULL; - case 16: result += (value[length - 16] - '0') * 1000000000000000ULL; - case 15: result += (value[length - 15] - '0') * 100000000000000ULL; - case 14: result += (value[length - 14] - '0') * 10000000000000ULL; - case 13: result += (value[length - 13] - '0') * 1000000000000ULL; - case 12: result += (value[length - 12] - '0') * 100000000000ULL; - case 11: result += (value[length - 11] - '0') * 10000000000ULL; - case 10: result += (value[length - 10] - '0') * 1000000000ULL; - case 9: result += (value[length - 9] - '0') * 100000000ULL; - case 8: result += (value[length - 8] - '0') * 10000000ULL; - case 7: result += (value[length - 7] - '0') * 1000000ULL; - case 6: result += (value[length - 6] - '0') * 100000ULL; - case 5: result += (value[length - 5] - '0') * 10000ULL; - case 4: result += (value[length - 4] - '0') * 1000ULL; - case 3: result += (value[length - 3] - '0') * 100ULL; - case 2: result += (value[length - 2] - '0') * 10ULL; - case 1: result += (value[length - 1] - '0'); - } - return result; -} - -template<> -uint64_t RSJresource::as(const uint64_t& def) { - if (!exists()) return (0); // required - return (unrolled(data)); // example -} +#include "cpp-json/json.h" +#include "zstr.hpp" // thanks to https://stackoverflow.com/a/8615450/3371177 inline std::vector 
glob2(const std::string& pattern) { @@ -175,29 +138,29 @@ namespace kSpider { if (idx != std::string::npos) extension = file_name.substr(idx + 1); if (extension != "sig") continue; - std::ifstream sig_stream(file_name); - JSON sig(sig_stream); - int number_of_sub_sigs = sig[0]["signatures"].size(); - string general_name = sig[0]["name"].as(); - if (general_name == "") { - std::string sig_basename = sig_prefix.substr(sig_prefix.find_last_of("/\\") + 1); - general_name = sig_basename; - } + zstr::ifstream sig_stream(file_name); + json::value json = json::parse(sig_stream); + + + auto sourmash_sig = json[0]["signatures"]; + const json::array& sig_array = as_array(sourmash_sig); //START - for (int i = 0; i < number_of_sub_sigs; i++) { - int current_kSize = sig[0]["signatures"][i]["ksize"].as(); - if (current_kSize != selective_kSize) continue; + for (auto it = sig_array.begin(); it != sig_array.end(); ++it) { + + const json::value& v = *it; + if (v["ksize"] != selective_kSize) { + continue; + } + + cout << "Processing " << ++processed_sigs_count << "/" << total_sigs_number << " | " << sig_basename << " k:" << selective_kSize << " ... " << endl; - cout << "Processing " << ++processed_sigs_count << "/" << total_sigs_number << " | " << general_name << " k:" << selective_kSize << " ... 
" << endl; - string md5sum = sig[0]["signatures"][i]["md5sum"].as(); - string sig_name = md5sum + ":" + general_name; flat_hash_map convertMap; - string readName = sig_name; - string groupName = general_name; + string readName = sig_basename; + string groupName = sig_basename; uint64_t readTag = groupNameMap.find(groupName)->second; @@ -206,10 +169,11 @@ namespace kSpider { convertMap.insert(make_pair(0, readTag)); convertMap.insert(make_pair(readTag, readTag)); + const json::array& mins = as_array(v["mins"]); + auto loaded_sig_it = mins.begin(); - auto loaded_sig_it = sig[0]["signatures"][i]["mins"].as_array().begin(); - while (loaded_sig_it != sig[0]["signatures"][i]["mins"].as_array().end()) { - uint64_t hashed_kmer = loaded_sig_it->as(); + while (loaded_sig_it != mins.end()) { + uint64_t hashed_kmer = json::to_number(*loaded_sig_it); uint64_t currentTag = frame->getCount(hashed_kmer); auto itc = convertMap.find(currentTag); if (itc == convertMap.end()) { @@ -289,6 +253,9 @@ namespace kSpider { } cout << " saved_kmers(~" << frame->size() << ")." << endl << endl; + cout << " colors(~" << legend->size() << ")." 
<< endl << endl; + + break; } // END diff --git a/validate.cpp b/validate.cpp new file mode 100644 index 0000000..3c42265 --- /dev/null +++ b/validate.cpp @@ -0,0 +1,66 @@ +#include +#include +#include +#include "parallel_hashmap/phmap.h" +#include +#include +#include "cpp-json/json.h" +#include "zstr.hpp" +#include +#include +#include +#include "parallel_hashmap/phmap_dump.h" +#include + +using namespace std; +// using namespace phmap; + +typedef std::chrono::high_resolution_clock Time; + + +int main(int argc, char** argv) { + + if (argc != 4) { + cout << "run: ./validate " << endl; + exit(1); + } + + string sig_path = argv[1]; + int kSize = stoi(argv[2]); + string bin_path = argv[3]; + + phmap::flat_hash_set tmp_hashes; + + auto begin_time = Time::now(); + zstr::ifstream sig_stream(sig_path); + json::value json = json::parse(sig_stream); + auto sourmash_sig = json[0]["signatures"]; + const json::array& sig_array = as_array(sourmash_sig); + for (auto it = sig_array.begin(); it != sig_array.end(); ++it) { + const json::value& v = *it; + if (v["ksize"] == kSize) { + const json::array& mins = as_array(v["mins"]); + auto mins_it = mins.begin(); + while (mins_it != mins.end()) { + tmp_hashes.insert(json::to_number(*mins_it)); + mins_it++; + } + } + break; + } + + cerr << "loading ..." << endl; + phmap::flat_hash_set table_in; + phmap::BinaryInputArchive ar_in(bin_path.c_str()); + table_in.phmap_load(ar_in); + cerr << "loading done..." 
<< endl; + + + uint64_t shared_hashes = count_if(table_in.begin(), table_in.end(), [&](uint64_t k) {return tmp_hashes.find(k) != tmp_hashes.end();}); + + cout << "loaded bin size: " << table_in.size() << endl; + cout << "loaded sig size: " << tmp_hashes.size() << endl; + cout << "shared hashes: " << shared_hashes << endl; + + +} \ No newline at end of file diff --git a/validate_bins.sh b/validate_bins.sh new file mode 100644 index 0000000..1a47df1 --- /dev/null +++ b/validate_bins.sh @@ -0,0 +1,20 @@ +CHECKER=$1 +bins_dir=$2 +REPORT=${bins_dir}_validate_report.txt +rm -rf ${REPORT} +touch ${REPORT} +# CHECKER=/home/mabuelanin/dib-dev/kSpider_bins/build/check_bin +no_bins=$(find ${bins_dir} -printf \\n | wc -l) +COUNTER=1 +for bin in ${bins_dir}/*.bin; +do + let COUNTER++; + echo "${COUNTER}/${no_bins}"; + result=$(${CHECKER} ${bin} 2>&1); + if [[ "${result}" == *"VALID_BIN"* ]]; + then + echo -e "${bin} | ${result}" >> ${REPORT} + else + echo "${bin} | INVALID" >> ${REPORT} + fi +done \ No newline at end of file