close #232

gagolews · Feb 8, 2019 · 1fcf565 · 1fcf565
1 parent d77b6f1
commit 1fcf565
Show file tree

Hide file tree

Showing 27 changed files with 338 additions and 188 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -9,6 +9,7 @@
 ^src-x64
 ^src/.*\.o$
 ^src/icu55/data
+^src/icu61/data/icudt61b.zip
 ^src/.*\.a$
 ^src/.*\.so$
 ^src/.*\.dll$

diff --git a/.gitignore b/.gitignore
@@ -48,3 +48,5 @@ devel/benchmarks/figure
 devel/benchmarks/report-*.md
 devel/benchmarks/report-*.html
 devel/benchmarks/test1.csv.gz
+
+.vscode
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: stringi
 Version: 1.3.1
-Date: 2019-02-06
+Date: 2019-02-08
 Title: Character String Processing Facilities
 Description: Fast, correct, consistent, portable,
     as well as convenient character string/text processing in every locale

diff --git a/INSTALL b/INSTALL
@@ -19,15 +19,14 @@ customized installation.
 stringi vs the ICU4C library
 ----------------------------
 
-The stringi package depends on the ICU4C >= 52 library
-with `U_CHARSET_IS_UTF8` flag disabled.
+The stringi package depends on the ICU4C >= 52 library.
 
 If you build the package from sources and either:
-* these requirements are not met (check out http://site.icu-project.org/download,
+* this requirement is not met (check out http://site.icu-project.org/download,
    the `libicu-devel` rpm on Fedora/CentOS/OpenSUSE,
-   or `libicu-dev` on Ubuntu/Debian etc.),
-* `pkg-config` is not able to find proper build settings for ICU-based projects,
-or
+   or `libicu-dev` on Ubuntu/Debian, etc.),
+* `pkg-config` is not able to find appropriate build settings
+   for ICU-based projects, or
 * `R CMD INSTALL` is called with the `--configure-args='--disable-pkg-config'`
 argument or `install.packages("stringi", configure.args="--disable-pkg-config")`
 is executed,
@@ -54,7 +53,7 @@ of the package. It already includes the `ICU` data archives.
     unzip stringi-master.zip
     R CMD INSTALL stringi-master
 
-You may also prepare your own `.tar.gz`-ipped distribution of `stringi`
+You can also prepare your own `.tar.gz`-ipped distribution of `stringi`
 as follows:
 
 1. Execute the `git clone https://github.com/gagolews/stringi.git` command.
@@ -69,10 +68,11 @@ C++11 support
 -------------
 
 For R >= 3.1.0  we suggest (by default) C++11 support to build the package
-from sources. This is because ICU4C uses the `long long` type in a few
-functions, and this is not part of the C++98 standard.
+from sources. This is because older releases of ICU4C use the `long long`
+type in a few functions, and this is not part of the C++98 standard. Moreover,
+C++11 support has become required by newer versions of ICU4C.
 
-Yet, if your compiler does not support C++11 or it has not been properly
+However, if your compiler does not support C++11 or it has not been properly
 configured (check out `<R_inst_dir>/etc/Makeconf`) but you are sure it
 tolerates the `long long` type (which is very common -- this is checked by the
 `configure` script anyway), you may disable the use of C++11 by passing
@@ -131,6 +131,6 @@ have a working internet access, all is expected to lead to a happy ending.
 
 If you do not manage to set up a successful stringi build, do not
 hesitate to [file a bug report](https://github.com/gagolews/stringi/issues).
-However, please check the already-closed issues for similar problems
-experienced by other users - it is very likely they have already been
-succesfully resolved.
+However, please check the archived (closed) issues for similar problems
+experienced by other users -- it is very likely they have already been
+successfully resolved.
diff --git a/NEWS b/NEWS
@@ -13,10 +13,13 @@ the default (UTF-8) encoding cannot be changed.
 
 * TODO UBSAN..............................
 
+* [NEW FEATURE] #232: all `stri_detect_*` functions now have the `max_count`
+argument that allows for, e.g., stopping at first pattern occurrence.
+
 * [NEW FEATURE] #338: `stri_sub_replace` is now an alias for `stri_sub<-`
 (@yutannihilation, @BastienFR), which makes it much more easily pipable.
 
-* [NEW FEATURE] #334: Added missing icudt61b.dat to support big-endian
+* [NEW FEATURE] #334: Added missing `icudt61b.dat` to support big-endian
 platforms. Thanks to Dimitri John Ledkov (@xnox).
 
 * [BUGFIX] #319: Fixed overflow in `stri_rand_shuffle()`.
@@ -30,7 +33,7 @@ on empty search patters.
 
 * [BUGFIX] #314: test `U_CHARSET_IS_UTF8` in configure when using pkg-build.
 
-* [BUGFIX] #317: include icudt61l.zip in the source bundle to solve the
+* [BUGFIX] #317: include `icudt61l.zip` in the source bundle to solve the
 frequent `icudt download failed` error (also on CRAN's windows-release
 and windows-oldrel).
 
@@ -39,25 +42,26 @@ and windows-oldrel).
 
 * [BUGFIX] #296: Fixed the behavior of the `./configure` script on CentOS 6.
 
-* [BUGFIX] Fixed broken Windows build by updating the icudt mirror list.
+* [BUGFIX] Fixed broken Windows build by updating the `icudt` mirror list.
 
 
 ## 1.2.2 (2018-05-01) **CRAN**
 
-* [GENERAL] #193: `stringi` is now bundled with ICU4C 61.1,
+* [GENERAL] #193: stringi is now bundled with ICU4C 61.1,
 which is used on most Windows and OS X builds as well as on *nix systems
 not equipped with ICU. However, if the C++11 support is disabled,
 stringi will be built against ICU4C 55.1. The update to ICU brings
 Unicode 10.0 support, including new emoji characters.
 
-* [BUGFIX] #288: stri_match did not return the correct number of columns
+* [BUGFIX] #288: `stri_match()` did not return the correct number of columns
 when input was empty.
 
-* [NEW FEATURE] #188: `stri_enc_detect` now returns a list of data frames.
+* [NEW FEATURE] #188: `stri_enc_detect()` now returns a list of data frames.
 
-* [NEW FEATURE] #289: `stri_flatten` gained `na_empty` `omit_empty` arguments.
+* [NEW FEATURE] #289: `stri_flatten()` now has
+`na_empty` and `omit_empty` arguments.
 
-* [NEW FEATURE] New functions: `stri_remove_empty`, `stri_na2empty`
+* [NEW FEATURE] New functions: `stri_remove_empty()`, `stri_na2empty()`
 
 * [NEW FEATURE] #285: Coercion from a non-trivial list (one that consists
 of atomic vectors, each of length 1) to an atomic vector now issues a warning.
@@ -85,11 +89,11 @@ This fixes problems with - among others - displaying the Euro sign.
 * [NEW FEATURE] #263: Add support for custom rule-based break iteration,
 see `?stri_opts_brkiter`.
 
-* [NEW FEATURE] #267: `omit_na=TRUE` in `stri_sub<-` now ignores missing values
+* [NEW FEATURE] #267: `omit_na=TRUE` in `stri_sub<-()` now ignores missing values
 in any of the arguments provided.
 
 * [BUGFIX] fixed unPROTECTed variable names and stack imbalances
-as reported by rchk
+as reported by `rchk`.
 
 -------------------------------------------------------------------------------
 
@@ -128,9 +132,9 @@ following environment variables: `STRINGI_CFLAGS`, `STRINGI_CPPFLAGS`,
 `STRINGI_DISABLE_ICU_BUNDLE`, `STRINGI_DISABLE_PKG_CONFIG`, `PKG_CONFIG`,
 see `INSTALL` for more information.
 
-* [BUILD TIME] #253: call to `R_useDynamicSymbols` added.
+* [BUILD TIME] #253: call to `R_useDynamicSymbols()` added.
 
-* [BUILD TIME] #230: icudt is now being downloaded by
+* [BUILD TIME] #230: `icudt` is now being downloaded by
 `./configure` (*NIX only) *before* building.
 
 * [BUILD TIME] #242: `_COUNT/_LIMIT` enum constants have been deprecated
@@ -140,7 +144,7 @@ as of ICU 58.2, stringi code has been upgraded accordingly.
 
 ## 1.1.2 (2016-09-30) **CRAN**
 
-* [BUGFIX] round(), snprintf() is not C++98
+* [BUGFIX] `round()` and `snprintf()` are not part of C++98.
 
 -------------------------------------------------------------------------------
 
@@ -151,20 +155,20 @@ as of ICU 58.2, stringi code has been upgraded accordingly.
 * [BUGFIX] #210: `stri_replace_all_fixed(c("1", "NULL"), "NULL", NA)`
 now results in `c("1", NA)`.
 
-* [NEW FEATURE] #199: `stri_sub<-` now allows for ignoring `NA` locations
+* [NEW FEATURE] #199: `stri_sub<-()` now allows for ignoring `NA` locations
 (a new `omit_na` argument added).
 
-* [NEW FEATURE] #207: `stri_sub<-` now allows for substring insertions
+* [NEW FEATURE] #207: `stri_sub<-()` now allows for substring insertions
 (via `length=0`).
 
-* [NEW FUNCTION] #124: `stri_subset<-` functions added.
+* [NEW FUNCTION] #124: `stri_subset<-()` functions added.
 
-* [NEW FEATURE] #216: `stri_detect`, `stri_subset`, `stri_subset<-` gained
-a `negate` argument.
+* [NEW FEATURE] #216: `stri_detect()`, `stri_subset()`, `stri_subset<-()`
+now all have the `negate` argument.
 
-* [NEW FUNCTION] #175: `stri_join_list` concatenates all strings
-in a list of character vectors. Useful with, e.g., `stri_extract_all_regex`,
-`stri_extract_all_words` etc.
+* [NEW FUNCTION] #175: `stri_join_list()` concatenates all strings
+in a list of character vectors. Useful with, e.g., `stri_extract_all_regex()`,
+`stri_extract_all_words()`, etc.
 
 -------------------------------------------------------------------------------
 
@@ -224,7 +228,7 @@ boundaries.
 `chartr()` equivalent.
 
 * [NEW FUNCTION] #8: `stri_width()` approximates the *width* of a string
-in a more Unicodish fashion than `nchar(..., "width")`
+in a more Unicode-ish fashion than `nchar(..., "width")`
 
 * [NEW FEATURE] #149: `stri_pad()` and `stri_wrap()` are now by default based on
 code point widths instead of the number of code points. Moreover, the default

diff --git a/R/search_count_4.R b/R/search_count_4.R
@@ -1,5 +1,5 @@
 ## This file is part of the 'stringi' package for R.
-## Copyright (c) 2013-2017, Marek Gagolewski and other contributors.
+## Copyright (c) 2013-2019, Marek Gagolewski and other contributors.
 ## All rights reserved.
 ##
 ## Redistribution and use in source and binary forms, with or without
@@ -45,8 +45,7 @@
 #' \code{stri_count} is a convenience function.
 #' It calls either \code{stri_count_regex},
 #' \code{stri_count_fixed}, \code{stri_count_coll},
-#' or \code{stri_count_charclass}, depending on the argument used; relying
-#' on one of those underlying functions will be faster.
+#' or \code{stri_count_charclass}, depending on the argument used.
 #'
 #' @param str character vector with strings to search in
 #' @param pattern,regex,fixed,coll,charclass character vector defining search patterns;

diff --git a/R/search_count_bound.R b/R/search_count_bound.R
@@ -1,5 +1,5 @@
 ## This file is part of the 'stringi' package for R.
-## Copyright (c) 2013-2017, Marek Gagolewski and other contributors.
+## Copyright (c) 2013-2019, Marek Gagolewski and other contributors.
 ## All rights reserved.
 ##
 ## Redistribution and use in source and binary forms, with or without
@@ -33,7 +33,7 @@
 #' Count the Number of Text Boundaries
 #'
 #' @description
-#' This function determines the number of specific text boundaries
+#' These functions determine the number of specific text boundaries
 #' (like character, word, line, or sentence boundaries) in a string.
 #'
 #' @details
@@ -50,15 +50,14 @@
 #' to locate word boundaries, and all non-word characters
 #' (\code{UBRK_WORD_NONE} rule status) are ignored.
 #' This function is equivalent to a call to
-#' \code{\link{stri_count_boundaries}(str, type="word", skip_word_none=TRUE, locale=locale)}
+#' \code{\link{stri_count_boundaries}(str, type="word", skip_word_none=TRUE, locale=locale)}.
 #'
 #' Note that a \code{BreakIterator} of type \code{character}
 #' may be used to count the number of \emph{Unicode characters} in a string.
-#' This may lead to different results than that returned by the
-#' \code{\link{stri_length}} function, which is designed to return
-#' the number of \emph{Unicode code points}.
+#' This may report different results than \code{\link{stri_length}},
+#' which aims to count the number of \emph{Unicode code points}.
 #'
-#' On the other hand, a \code{BreakIterator} of type \code{sentence}
+#' Moreover, a \code{BreakIterator} of type \code{sentence}
 #' may be used to count the number of sentences in a piece of text.
 #'
 #'
@@ -75,7 +74,7 @@
 #' Both functions return an integer vector.
 #'
 #' @examples
-#' test <- "The\u00a0above-mentioned    features are very useful. Warm thanks to their developers."
+#' test <- "The\u00a0above-mentioned    features are very useful. Kudos to their developers."
 #' stri_count_boundaries(test, type="word")
 #' stri_count_boundaries(test, type="sentence")
 #' stri_count_boundaries(test, type="character")

diff --git a/R/search_detect_4.R b/R/search_detect_4.R
@@ -1,5 +1,5 @@
 ## This file is part of the 'stringi' package for R.
-## Copyright (c) 2013-2017, Marek Gagolewski and other contributors.
+## Copyright (c) 2013-2019, Marek Gagolewski and other contributors.
 ## All rights reserved.
 ##
 ## Redistribution and use in source and binary forms, with or without
@@ -46,19 +46,26 @@
 #' It calls either \code{stri_detect_regex},
 #' \code{stri_detect_fixed}, \code{stri_detect_coll},
 #' or \code{stri_detect_charclass}, depending on the argument used.
-#' Relying on these underlying functions will make your code run slightly
-#' faster.
 #'
 #' See also \code{\link{stri_startswith}} and \code{\link{stri_endswith}}
-#' for testing whether a string starts or ends with a given pattern
-#' match, respectively. Moreover, see \code{\link{stri_subset}}
-#' for a character vector subsetting.
+#' for testing whether a string starts or ends with a match to a given pattern.
+#' Moreover, see \code{\link{stri_subset}} for character vector subsetting.
+#'
+#' If \code{max_count} is negative, then all strings are examined
+#' in search of a given pattern. Otherwise, the search stops
+#' once \code{max_count} matches (or, if \code{negate} is \code{TRUE},
+#' no-matches) are detected. The uninspected cases are marked
+#' as missing in the return vector. Be aware that, unless \code{pattern} is a
+#' singleton, the elements in \code{str} might be inspected in a
+#' non-consecutive order.
 #'
 #'
 #' @param str character vector with strings to search in
-#' @param pattern,regex,fixed,coll,charclass character vector defining search patterns;
-#' for more details refer to \link{stringi-search}
+#' @param pattern,regex,fixed,coll,charclass character vector
+#' defining search patterns; for more details refer to \link{stringi-search}
 #' @param negate single logical value; whether a no-match is rather of interest
+#' @param max_count single integer; stops searching once a given
+#' number of occurrences is detected; \code{-1} (the default) inspects all
 #' @param opts_collator,opts_fixed,opts_regex a named list used to tune up
 #' a search engine's settings; see
 #' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
@@ -71,18 +78,25 @@
 #' @return Each function returns a logical vector.
 #'
 #' @examples
-#' stri_detect_fixed(c("stringi R", "REXAMINE", "123"), c('i', 'R', '0'))
-#' stri_detect_fixed(c("stringi R", "REXAMINE", "123"), 'R')
+#' stri_detect_fixed(c("stringi R", "R STRINGI", "123"), c('i', 'R', '0'))
+#' stri_detect_fixed(c("stringi R", "R STRINGI", "123"), 'R')
 #'
-#' stri_detect_charclass(c("stRRRingi","REXAMINE", "123"),
+#' stri_detect_charclass(c("stRRRingi","R STRINGI", "123"),
 #'    c("\\p{Ll}", "\\p{Lu}", "\\p{Zs}"))
 #'
-#' stri_detect_regex(c("stringi R", "REXAMINE", "123"), 'R.')
-#' stri_detect_regex(c("stringi R", "REXAMINE", "123"), '[[:alpha:]]*?')
-#' stri_detect_regex(c("stringi R", "REXAMINE", "123"), '[a-zC1]')
-#' stri_detect_regex(c("stringi R", "REXAMINE", "123"), '( R|RE)')
+#' stri_detect_regex(c("stringi R", "R STRINGI", "123"), 'R.')
+#' stri_detect_regex(c("stringi R", "R STRINGI", "123"), '[[:alpha:]]*?')
+#' stri_detect_regex(c("stringi R", "R STRINGI", "123"), '[a-zC1]')
+#' stri_detect_regex(c("stringi R", "R STRINGI", "123"), '( R|RE)')
 #' stri_detect_regex("stringi", "STRING.", case_insensitive=TRUE)
 #'
+#' stri_detect_regex(c("abc", "def", "123", "ghi", "456", "789", "jkl"),
+#'    "^[0-9]+$", max_count=1)
+#' stri_detect_regex(c("abc", "def", "123", "ghi", "456", "789", "jkl"),
+#'    "^[0-9]+$", max_count=2)
+#' stri_detect_regex(c("abc", "def", "123", "ghi", "456", "789", "jkl"),
+#'    "^[0-9]+$", negate=TRUE, max_count=3)
+#'
 #' @family search_detect
 #' @export
 #' @rdname stri_detect
@@ -91,7 +105,7 @@ stri_detect <- function(str, ..., regex, fixed, coll, charclass) {
                     "coll" =!missing(coll),  "charclass"=!missing(charclass))
 
    if (sum(providedarg) != 1)
-      stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")
+      stop("you have to specify one of: `regex`, `fixed`, `coll`, or `charclass`")
 
    if (providedarg["regex"])
       stri_detect_regex(str, regex, ...)
@@ -105,32 +119,39 @@ stri_detect <- function(str, ..., regex, fixed, coll, charclass) {
 
 #' @export
 #' @rdname stri_detect
-stri_detect_fixed <- function(str, pattern, negate=FALSE, ..., opts_fixed=NULL) {
+stri_detect_fixed <- function(str, pattern, negate=FALSE, max_count=-1,
+        ..., opts_fixed=NULL)
+{
    if (!missing(...))
        opts_fixed <- do.call(stri_opts_fixed, as.list(c(opts_fixed, ...)))
-   .Call(C_stri_detect_fixed, str, pattern, negate, opts_fixed)
+   .Call(C_stri_detect_fixed, str, pattern, negate, max_count, opts_fixed)
 }
 
 #' @export
 #' @rdname stri_detect
-stri_detect_charclass <- function(str, pattern, negate=FALSE) {
-   .Call(C_stri_detect_charclass, str, pattern, negate)
+stri_detect_charclass <- function(str, pattern, negate=FALSE, max_count=-1)
+{
+   .Call(C_stri_detect_charclass, str, pattern, negate, max_count)
 }
 
 
 #' @export
 #' @rdname stri_detect
-stri_detect_coll <- function(str, pattern, negate=FALSE, ..., opts_collator=NULL) {
+stri_detect_coll <- function(str, pattern, negate=FALSE, max_count=-1,
+        ..., opts_collator=NULL)
+{
    if (!missing(...))
        opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...)))
-   .Call(C_stri_detect_coll, str, pattern, negate, opts_collator)
+   .Call(C_stri_detect_coll, str, pattern, negate, max_count, opts_collator)
 }
 
 
 #' @export
 #' @rdname stri_detect
-stri_detect_regex <- function(str, pattern, negate=FALSE, ..., opts_regex=NULL) {
+stri_detect_regex <- function(str, pattern, negate=FALSE, max_count=-1,
+        ..., opts_regex=NULL)
+{
    if (!missing(...))
        opts_regex <- do.call(stri_opts_regex, as.list(c(opts_regex, ...)))
-   .Call(C_stri_detect_regex, str, pattern, negate, opts_regex)
+   .Call(C_stri_detect_regex, str, pattern, negate, max_count, opts_regex)
 }