[2024.6]
Bruce committed Jun 13, 2024
1 parent 18cc226 commit 556010b
Showing 11 changed files with 119 additions and 114 deletions.
6 changes: 3 additions & 3 deletions CRAN-SUBMISSION
@@ -1,3 +1,3 @@
-Version: 2024.5
-Date: 2024-05-19 05:24:12 UTC
-SHA: e9adda42c47daffb54fbab63ec47cd2b185be9e8
+Version: 2024.6
+Date: 2024-06-12 14:38:07 UTC
+SHA: 18cc22695b8c596044f4d2e85f6373ba37012258
11 changes: 5 additions & 6 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: FMAT
Title: The Fill-Mask Association Test
-Version: 2024.5
-Date: 2024-05-15
+Version: 2024.6
+Date: 2024-06-12
Authors@R:
c(person(given = "Han-Wu-Shuang",
family = "Bao",
@@ -27,9 +27,8 @@ BugReports: https://github.com/psychbruce/FMAT/issues
SystemRequirements: Python (>= 3.9.0)
Depends: R (>= 4.0.0)
Imports:
-PsychWordVec, reticulate,
-data.table, stringr, forcats, psych, irr,
-glue, cli, purrr, plyr, dplyr, tidyr
-Suggests: bruceR, text, nlme
+reticulate, data.table, stringr, forcats, psych, irr,
+glue, crayon, cli, purrr, plyr, dplyr, tidyr
+Suggests: bruceR, PsychWordVec, text, sweater, nlme
RoxygenNote: 7.3.1
Roxygen: list(markdown = TRUE)
7 changes: 5 additions & 2 deletions NAMESPACE
@@ -11,10 +11,13 @@ export(FMAT_query_bind)
export(FMAT_run)
export(ICC_models)
export(LPR_reliability)
export(cc)
import(data.table)
import(stringr)
-importFrom(PsychWordVec,cc)
+importFrom(crayon,blue)
+importFrom(crayon,green)
+importFrom(crayon,italic)
+importFrom(crayon,magenta)
+importFrom(crayon,underline)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(forcats,as_factor)
5 changes: 5 additions & 0 deletions NEWS.md
@@ -1,5 +1,10 @@
**Please check the [latest news (change log)](https://psychbruce.github.io/FMAT/news/index.html) and keep this package updated.**

+# FMAT 2024.6
+
+- Fixed bugs: now only `BERT_download()` connects to the Internet; all other functions run offline.
+- Improved installation guidance for Python packages.
+
# FMAT 2024.5

- Added `BERT_info()`.
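The offline behavior described in this entry implies a download-once, run-offline workflow; a minimal sketch (the model name is only an example):

```
library(FMAT)

## The only step that requires an Internet connection:
BERT_download("bert-base-uncased")

## All other functions load from the local cache, offline:
BERT_info("bert-base-uncased")
```
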
111 changes: 61 additions & 50 deletions R/FMAT.R
@@ -6,6 +6,7 @@
#' @importFrom dplyr left_join mutate
#' @importFrom forcats as_factor
#' @importFrom stats na.omit
+#' @importFrom crayon italic underline green blue magenta
.onAttach = function(libname, pkgname) {
inst.ver = as.character(utils::packageVersion("FMAT"))
pkg.date = substr(utils::packageDate("FMAT"), 1, 4)
@@ -45,9 +46,9 @@
#### Utils ####


-#' @importFrom PsychWordVec cc
-#' @export
-PsychWordVec::cc
+## #' @importFrom PsychWordVec cc
+## #' @export
+## PsychWordVec::cc


#' A simple function equivalent to `list`.
@@ -57,8 +58,7 @@ PsychWordVec::cc
#' @return A list of named objects.
#'
#' @examples
-#' .(Male=cc("he, his"), Female=cc("she, her"))
-#' list(Male=cc("he, his"), Female=cc("she, her")) # the same
+#' .(Male=c("he", "his"), Female=c("she", "her"))
#'
#' @export
. = function(...) list(...)
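
As the example above shows, `.()` is interchangeable with `list()`:

```
.(Male=c("he", "his"), Female=c("she", "her"))
## identical to
list(Male=c("he", "his"), Female=c("she", "her"))
```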
@@ -113,12 +113,14 @@ transformers_init = function(print.info=TRUE) {
gpu.info = paste("GPU (Device):", paste(torch$cuda$get_device_name(), collapse=", "))
} else {
cuda.ver = "NULL"
-gpu.info = paste("(To use GPU, install PyTorch with CUDA support,",
-"see https://pytorch.org/get-started)")
+gpu.info = "To use GPU, see https://psychbruce.github.io/FMAT/#guidance-for-gpu-acceleration"
}

transformers = reticulate::import("transformers")
tf.ver = transformers$`__version__`

+hfh.ver = reticulate::import("huggingface_hub")$`__version__`
+url.ver = reticulate::import("urllib3")$`__version__`
})
if(print.info) {
cli::cli_alert_info(cli::col_blue("Device Info:
@@ -130,6 +132,8 @@
Python Packages:
transformers {tf.ver}
torch {torch.ver}
+urllib3 {url.ver}
+huggingface-hub {hfh.ver}
NVIDIA GPU CUDA Support:
CUDA Enabled: {torch.cuda}
@@ -142,8 +146,10 @@


fill_mask_init = function(transformers, model, device=-1L) {
-config = transformers$AutoConfig$from_pretrained(model, local_files_only=TRUE)
-fill_mask = transformers$pipeline("fill-mask", model=model, config=config,
+cache.folder = get_cache_folder(transformers)
+model.local = get_cached_model_path(cache.folder, model)
+config = transformers$AutoConfig$from_pretrained(model.local, local_files_only=TRUE)
+fill_mask = transformers$pipeline("fill-mask", model=model.local, config=config,
model_kwargs=list(local_files_only=TRUE),
device=device)
return(fill_mask)
@@ -207,22 +213,36 @@ add_tokens = function(
}


-find_cached_models = function(cache.folder) {
+get_cache_folder = function(transformers) {
+str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/")
+}
+
+
+get_cached_models = function(cache.folder) {
models.name = list.files(cache.folder, "^models--")
if(length(models.name) > 0) {
-models.size = sapply(paste0(cache.folder, "/", models.name), function(folder) {
+models.info = do.call("rbind", lapply(paste0(cache.folder, "/", models.name), function(folder) {
models.file = list.files(folder, pattern="(model.safetensors$|pytorch_model.bin$|tf_model.h5$)", recursive=TRUE, full.names=TRUE)
-paste(paste0(sprintf("%.0f", file.size(models.file) / 1024^2), " MB"), collapse=" / ")
-})
+size = paste(paste0(sprintf("%.0f", file.size(models.file) / 1024^2), " MB"), collapse=" / ")
+download.date = paste(str_remove(file.mtime(models.file), " .*"), collapse=" / ")
+return(data.frame(size, download.date))
+}))
models.name = str_replace_all(str_remove(models.name, "^models--"), "--", "/")
-models.info = data.frame(size=models.size, row.names=models.name)
+row.names(models.info) = models.name
} else {
models.info = NULL
}
return(models.info)
}


+get_cached_model_path = function(cache.folder, model) {
+model.folder = paste0(cache.folder, "/models--", str_replace_all(model, "/", "--"))
+model.path = list.files(model.folder, pattern="(model.safetensors$|pytorch_model.bin$|tf_model.h5$)", recursive=TRUE, full.names=TRUE)[1]
+return(dirname(model.path))
+}
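
For reference, `get_cached_model_path()` relies on the Hugging Face cache layout, in which each model ID maps to a cache subfolder with `/` replaced by `--`. An illustration of the mapping (paths are examples, not package API):

```
## model ID:     "bert-base-uncased"
## cache folder: ~/.cache/huggingface/hub
## model folder: ~/.cache/huggingface/hub/models--bert-base-uncased
## weight file:  .../models--bert-base-uncased/snapshots/<revision>/model.safetensors
get_cached_model_path("~/.cache/huggingface/hub", "bert-base-uncased")
## returns the snapshots/<revision> directory containing the weight file
```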


#### BERT ####


@@ -269,9 +289,9 @@ BERT_download = function(models=NULL) {
gc()
})
}
-cache.folder = str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/")
+cache.folder = get_cache_folder(transformers)
cache.sizegb = sum(file.size(list.files(cache.folder, recursive=TRUE, full.names=TRUE))) / 1024^3
-local.models = find_cached_models(cache.folder)
+local.models = get_cached_models(cache.folder)
cli::cli_h2("Downloaded models:")
print(local.models)
cat("\n")
@@ -305,8 +325,8 @@
#' @export
BERT_info = function(models=NULL) {
transformers = transformers_init(print.info=FALSE)
-cache.folder = str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/")
-local.models = find_cached_models(cache.folder)
+cache.folder = get_cache_folder(transformers)
+local.models = get_cached_models(cache.folder)
dm = data.table(model=row.names(local.models), size=local.models$size)
model = NULL
if(!is.null(models)) {
@@ -318,8 +338,9 @@
}
dm$size = str_remove(dm$size, " ")
dm = cbind(dm, rbindlist(lapply(dm$model, function(model) {
-tokenizer = transformers$AutoTokenizer$from_pretrained(model, local_files_only=TRUE)
-model.obj = transformers$AutoModel$from_pretrained(model, local_files_only=TRUE)
+model.local = get_cached_model_path(cache.folder, model)
+tokenizer = transformers$AutoTokenizer$from_pretrained(model.local, local_files_only=TRUE)
+model.obj = transformers$AutoModel$from_pretrained(model.local, local_files_only=TRUE)
word.embeddings = model.obj$embeddings$word_embeddings$weight$data$shape
data.table(vocab = word.embeddings[0],
dims = word.embeddings[1],
@@ -430,7 +451,7 @@ BERT_vocab = function(
#' @export
FMAT_load = function(models) {
transformers = transformers_init()
-cache.folder = str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/")
+cache.folder = get_cache_folder(transformers)
cli::cli_text("Loading models from {.path {cache.folder}} ...")
fms = lapply(models, function(model) {
t0 = Sys.time()
@@ -461,9 +482,9 @@ fix_pair = function(X, var="MASK") {


# query = "[MASK] is ABC."
-# expand_pair(query, .(High=cc("high, strong"), Low=cc("low, weak")))
+# expand_pair(query, .(High=c("high", "strong"), Low=c("low", "weak")))
# expand_pair(query, .(H="high", M="medium", L="low"))
-# X = .(Flower=cc("rose, iris, lily"), Pos=cc("health, happiness, love, peace"))
+# X = .(Flower=c("rose", "iris", "lily"), Pos=c("health", "happiness", "love", "peace"))
# expand_full(query, X)


@@ -556,31 +577,22 @@ append_X = function(dq, X, var="TARGET") {
#' [`FMAT_run`]
#'
#' @examples
#' FMAT_query("[MASK] is a nurse.", MASK = .(Male="He", Female="She"))
#' \donttest{FMAT_query("[MASK] is a nurse.", MASK = .(Male="He", Female="She"))
#'
#' FMAT_query(
#' c("[MASK] is {TARGET}.", "[MASK] works as {TARGET}."),
#' MASK = .(Male="He", Female="She"),
-#' TARGET = .(Occupation=cc("a doctor, a nurse, an artist"))
+#' TARGET = .(Occupation=c("a doctor", "a nurse", "an artist"))
#' )
#'
#' FMAT_query(
#' "The [MASK] {ATTRIB}.",
-#' MASK = .(Male=cc("man, boy"),
-#' Female=cc("woman, girl")),
-#' ATTRIB = .(Masc=cc("is masculine, has a masculine personality"),
-#' Femi=cc("is feminine, has a feminine personality"))
+#' MASK = .(Male=c("man", "boy"),
+#' Female=c("woman", "girl")),
+#' ATTRIB = .(Masc=c("is masculine", "has a masculine personality"),
+#' Femi=c("is feminine", "has a feminine personality"))
#' )
#'
-#' FMAT_query(
-#' "The association between {TARGET} and {ATTRIB} is [MASK].",
-#' MASK = .(H="strong", L="weak"),
-#' TARGET = .(Flower=cc("rose, iris, lily"),
-#' Insect=cc("ant, cockroach, spider")),
-#' ATTRIB = .(Pos=cc("health, happiness, love, peace"),
-#' Neg=cc("death, sickness, hatred, disaster"))
-#' )
-#'
+#' }
#' @export
FMAT_query = function(
query = "Text with [MASK], optionally with {TARGET} and/or {ATTRIB}.",
@@ -658,19 +670,19 @@ FMAT_query = function(
#' [`FMAT_run`]
#'
#' @examples
-#' FMAT_query_bind(
+#' \donttest{FMAT_query_bind(
#' FMAT_query(
#' "[MASK] is {TARGET}.",
#' MASK = .(Male="He", Female="She"),
-#' TARGET = .(Occupation=cc("a doctor, a nurse, an artist"))
+#' TARGET = .(Occupation=c("a doctor", "a nurse", "an artist"))
#' ),
#' FMAT_query(
#' "[MASK] occupation is {TARGET}.",
#' MASK = .(Male="His", Female="Her"),
-#' TARGET = .(Occupation=cc("doctor, nurse, artist"))
+#' TARGET = .(Occupation=c("doctor", "nurse", "artist"))
#' )
#' )
-#'
+#' }
#' @export
FMAT_query_bind = function(...) {
types = sapply(list(...), attr, "type")
@@ -791,22 +803,21 @@
#' query1 = FMAT_query(
#' c("[MASK] is {TARGET}.", "[MASK] works as {TARGET}."),
#' MASK = .(Male="He", Female="She"),
-#' TARGET = .(Occupation=cc("a doctor, a nurse, an artist"))
+#' TARGET = .(Occupation=c("a doctor", "a nurse", "an artist"))
#' )
#' data1 = FMAT_run(models, query1)
#' summary(data1, target.pair=FALSE)
#'
#' query2 = FMAT_query(
#' "The [MASK] {ATTRIB}.",
-#' MASK = .(Male=cc("man, boy"),
-#' Female=cc("woman, girl")),
-#' ATTRIB = .(Masc=cc("is masculine, has a masculine personality"),
-#' Femi=cc("is feminine, has a feminine personality"))
+#' MASK = .(Male=c("man", "boy"),
+#' Female=c("woman", "girl")),
+#' ATTRIB = .(Masc=c("is masculine", "has a masculine personality"),
+#' Femi=c("is feminine", "has a feminine personality"))
#' )
#' data2 = FMAT_run(models, query2)
#' summary(data2, mask.pair=FALSE)
#' summary(data2)
-#'
#' }
#'
#' @export
@@ -833,7 +844,7 @@ FMAT_run = function(
rather than the returned object from `FMAT_load()`.", call.=FALSE)
} else {
transformers = transformers_init()
-cache.folder = str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/")
+cache.folder = get_cache_folder(transformers)
cli::cli_text("Loading models from {.path {cache.folder}} ...")
cat("\n")
}
32 changes: 23 additions & 9 deletions README.md
@@ -33,7 +33,7 @@ Han-Wu-Shuang (Bruce) Bao 包寒吴霜

## Installation

-To use the FMAT, the R package `FMAT` and two Python packages (`transformers` and `torch`) all need to be installed.
+To use the FMAT, the R package `FMAT` and three Python packages (`transformers`, `torch`, and `huggingface-hub`) all need to be installed.

### (1) R Package

@@ -48,22 +48,36 @@ devtools::install_github("psychbruce/FMAT", force=TRUE)

### (2) Python Environment and Packages

-Install [Anaconda](https://www.anaconda.com/download) (a recommended package manager which automatically installs Python, Python IDEs like Spyder, and a large list of necessary [Python package dependencies](https://docs.anaconda.com/free/anaconda/pkg-docs/)).
+Install [Anaconda](https://www.anaconda.com/download/success) (a recommended package manager which automatically installs Python, Python IDEs like Spyder, and a large list of necessary [Python package dependencies](https://docs.anaconda.com/free/anaconda/allpkglists/)).

-Specify the Python interpreter in RStudio.
+Specify Anaconda's Python interpreter in RStudio.

> RStudio → Tools → Global/Project Options\
> → Python → Select → **Conda Environments**\
> → Choose **".../Anaconda3/python.exe"**
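
Alternatively, the interpreter can be selected from R via reticulate (a sketch, assuming the default `base` conda environment):

```
reticulate::use_condaenv("base", required=TRUE)
reticulate::py_config()  # check which Python is in use
```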
Install the "[transformers](https://huggingface.co/docs/transformers/installation)" and "[torch](https://pytorch.org/get-started/locally/)" Python packages.\
(Windows Command / Anaconda Prompt / RStudio Terminal)
Install specific versions of Python packages "[transformers](https://pypi.org/project/transformers/#history)", "[torch](https://pypi.org/project/torch/#history)", and "[huggingface-hub](https://pypi.org/project/huggingface-hub/#history)".\
(RStudio Terminal / Anaconda Prompt / Windows Command)

+For CPU users:
+
+```
+pip install transformers==4.40.2 torch==2.2.1 huggingface-hub==0.20.3
+```
+
+For GPU (CUDA) users:
+
```
-pip install transformers torch
+pip install transformers==4.40.2 huggingface-hub==0.20.3
+pip install torch==2.2.1 --index-url https://download.pytorch.org/whl/cu121
```

-See [Guidance for GPU Acceleration] for installation guidance if you have an NVIDIA GPU device on your PC and want to use GPU to accelerate the pipeline.
+- See [Guidance for GPU Acceleration] for installation guidance if you have an NVIDIA GPU device on your PC and want to use GPU to accelerate the pipeline.
+- According to the May 2024 releases, "transformers" ≥ 4.41 depends on "huggingface-hub" ≥ 0.23. The suggested versions of "transformers" (4.40.2) and "huggingface-hub" (0.20.3) ensure the console display of progress bars when downloading BERT models while keeping these packages as new as possible (a version check is sketched after this list).
+- Proxy users should use the "global mode" to download models.
+- If you see the error `HTTPSConnectionPool(host='huggingface.co', port=443)`, try to (1) reinstall [Anaconda](https://www.anaconda.com/download/success), which may resolve some unknown issues, or (2) downgrade the "[urllib3](https://pypi.org/project/urllib3/)" package to version ≤ 1.25.11 (`pip install urllib3==1.25.11`) so that it uses HTTP proxies (rather than HTTPS proxies, as in later versions) to connect to Hugging Face.
+  - <https://www.cnblogs.com/devilmaycry812839668/p/17872452.html>
+  - <https://zhuanlan.zhihu.com/p/350015032>
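
One way to verify the pinned versions after installation (any of the terminals listed above):

```
pip show transformers torch huggingface-hub
```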

## Guidance for FMAT

@@ -112,10 +126,10 @@ Checklist:
- You may also install the corresponding version of CUDA Toolkit (e.g., for the `torch` version supporting CUDA 12.1, the same version of [CUDA Toolkit 12.1](https://developer.nvidia.com/cuda-12-1-0-download-archive) may also be installed).

Example code for installing PyTorch with CUDA support:\
-(Windows Command / Anaconda Prompt / RStudio Terminal)
+(RStudio Terminal / Anaconda Prompt / Windows Command)

```
-pip install torch --index-url https://download.pytorch.org/whl/cu121
+pip install torch==2.2.1 --index-url https://download.pytorch.org/whl/cu121
```
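
After installation, one way to confirm from R that PyTorch detects the GPU, using the same reticulate calls the package relies on:

```
torch = reticulate::import("torch")
torch$cuda$is_available()     # should be TRUE
torch$cuda$get_device_name()  # e.g., "NVIDIA GeForce RTX ..."
```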

## BERT Models