[2024.6]
Bruce committed Jun 13, 2024
1 parent 18cc226 commit 556010b
Showing 11 changed files with 119 additions and 114 deletions.
6 changes: 3 additions & 3 deletions CRAN-SUBMISSION
@@ -1,3 +1,3 @@
-Version: 2024.5
-Date: 2024-05-19 05:24:12 UTC
-SHA: e9adda42c47daffb54fbab63ec47cd2b185be9e8
+Version: 2024.6
+Date: 2024-06-12 14:38:07 UTC
+SHA: 18cc22695b8c596044f4d2e85f6373ba37012258
11 changes: 5 additions & 6 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: FMAT
Title: The Fill-Mask Association Test
-Version: 2024.5
-Date: 2024-05-15
+Version: 2024.6
+Date: 2024-06-12
Authors@R:
c(person(given = "Han-Wu-Shuang",
family = "Bao",
@@ -27,9 +27,8 @@ BugReports: https://github.com/psychbruce/FMAT/issues
SystemRequirements: Python (>= 3.9.0)
Depends: R (>= 4.0.0)
Imports:
-PsychWordVec, reticulate,
-data.table, stringr, forcats, psych, irr,
-glue, cli, purrr, plyr, dplyr, tidyr
-Suggests: bruceR, text, nlme
+reticulate, data.table, stringr, forcats, psych, irr,
+glue, crayon, cli, purrr, plyr, dplyr, tidyr
+Suggests: bruceR, PsychWordVec, text, sweater, nlme
RoxygenNote: 7.3.1
Roxygen: list(markdown = TRUE)
7 changes: 5 additions & 2 deletions NAMESPACE
@@ -11,10 +11,13 @@ export(FMAT_query_bind)
export(FMAT_run)
export(ICC_models)
export(LPR_reliability)
export(cc)
import(data.table)
import(stringr)
-importFrom(PsychWordVec,cc)
+importFrom(crayon,blue)
+importFrom(crayon,green)
+importFrom(crayon,italic)
+importFrom(crayon,magenta)
+importFrom(crayon,underline)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(forcats,as_factor)
5 changes: 5 additions & 0 deletions NEWS.md
@@ -1,5 +1,10 @@
**Please check the [latest news (change log)](https://psychbruce.github.io/FMAT/news/index.html) and keep this package updated.**

+# FMAT 2024.6
+
+- Fixed bugs: now only `BERT_download()` connects to the Internet; all other functions run offline.
+- Improved installation guidance for Python packages.
+
# FMAT 2024.5

- Added `BERT_info()`.
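The offline behavior described in this entry implies a download-once, run-offline workflow; a minimal sketch (the model name is only an example):

```
library(FMAT)

## The only step that requires an Internet connection:
BERT_download("bert-base-uncased")

## All other functions load from the local cache, offline:
BERT_info("bert-base-uncased")
```
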
111 changes: 61 additions & 50 deletions R/FMAT.R
@@ -6,6 +6,7 @@
#' @importFrom dplyr left_join mutate
#' @importFrom forcats as_factor
#' @importFrom stats na.omit
+#' @importFrom crayon italic underline green blue magenta
.onAttach = function(libname, pkgname) {
inst.ver = as.character(utils::packageVersion("FMAT"))
pkg.date = substr(utils::packageDate("FMAT"), 1, 4)
@@ -45,9 +46,9 @@
#### Utils ####


-#' @importFrom PsychWordVec cc
-#' @export
-PsychWordVec::cc
+## #' @importFrom PsychWordVec cc
+## #' @export
+## PsychWordVec::cc


#' A simple function equivalent to `list`.
@@ -57,8 +58,7 @@ PsychWordVec::cc
#' @return A list of named objects.
#'
#' @examples
-#' .(Male=cc("he, his"), Female=cc("she, her"))
-#' list(Male=cc("he, his"), Female=cc("she, her")) # the same
+#' .(Male=c("he", "his"), Female=c("she", "her"))
#'
#' @export
. = function(...) list(...)
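
As the example above shows, `.()` is interchangeable with `list()`:

```
.(Male=c("he", "his"), Female=c("she", "her"))
## identical to
list(Male=c("he", "his"), Female=c("she", "her"))
```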
@@ -113,12 +113,14 @@ transformers_init = function(print.info=TRUE) {
gpu.info = paste("GPU (Device):", paste(torch$cuda$get_device_name(), collapse=", "))
} else {
cuda.ver = "NULL"
-gpu.info = paste("(To use GPU, install PyTorch with CUDA support,",
-"see https://pytorch.org/get-started)")
+gpu.info = "To use GPU, see https://psychbruce.github.io/FMAT/#guidance-for-gpu-acceleration"
}

transformers = reticulate::import("transformers")
tf.ver = transformers$`__version__`

+hfh.ver = reticulate::import("huggingface_hub")$`__version__`
+url.ver = reticulate::import("urllib3")$`__version__`
})
if(print.info) {
cli::cli_alert_info(cli::col_blue("Device Info:
@@ -130,6 +132,8 @@
Python Packages:
transformers {tf.ver}
torch {torch.ver}
+urllib3 {url.ver}
+huggingface-hub {hfh.ver}
NVIDIA GPU CUDA Support:
CUDA Enabled: {torch.cuda}
@@ -142,8 +146,10 @@


fill_mask_init = function(transformers, model, device=-1L) {
-config = transformers$AutoConfig$from_pretrained(model, local_files_only=TRUE)
-fill_mask = transformers$pipeline("fill-mask", model=model, config=config,
+cache.folder = get_cache_folder(transformers)
+model.local = get_cached_model_path(cache.folder, model)
+config = transformers$AutoConfig$from_pretrained(model.local, local_files_only=TRUE)
+fill_mask = transformers$pipeline("fill-mask", model=model.local, config=config,
model_kwargs=list(local_files_only=TRUE),
device=device)
return(fill_mask)
@@ -207,22 +213,36 @@ add_tokens = function(
}


-find_cached_models = function(cache.folder) {
+get_cache_folder = function(transformers) {
+str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/")
+}
+
+
+get_cached_models = function(cache.folder) {
models.name = list.files(cache.folder, "^models--")
if(length(models.name) > 0) {
-models.size = sapply(paste0(cache.folder, "/", models.name), function(folder) {
+models.info = do.call("rbind", lapply(paste0(cache.folder, "/", models.name), function(folder) {
models.file = list.files(folder, pattern="(model.safetensors$|pytorch_model.bin$|tf_model.h5$)", recursive=TRUE, full.names=TRUE)
-paste(paste0(sprintf("%.0f", file.size(models.file) / 1024^2), " MB"), collapse=" / ")
-})
+size = paste(paste0(sprintf("%.0f", file.size(models.file) / 1024^2), " MB"), collapse=" / ")
+download.date = paste(str_remove(file.mtime(models.file), " .*"), collapse=" / ")
+return(data.frame(size, download.date))
+}))
models.name = str_replace_all(str_remove(models.name, "^models--"), "--", "/")
-models.info = data.frame(size=models.size, row.names=models.name)
+row.names(models.info) = models.name
} else {
models.info = NULL
}
return(models.info)
}


+get_cached_model_path = function(cache.folder, model) {
+model.folder = paste0(cache.folder, "/models--", str_replace_all(model, "/", "--"))
+model.path = list.files(model.folder, pattern="(model.safetensors$|pytorch_model.bin$|tf_model.h5$)", recursive=TRUE, full.names=TRUE)[1]
+return(dirname(model.path))
+}
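
For reference, `get_cached_model_path()` relies on the Hugging Face cache layout, in which each model ID maps to a cache subfolder with `/` replaced by `--`. An illustration of the mapping (paths are examples, not package API):

```
## model ID:     "bert-base-uncased"
## cache folder: ~/.cache/huggingface/hub
## model folder: ~/.cache/huggingface/hub/models--bert-base-uncased
## weight file:  .../models--bert-base-uncased/snapshots/<revision>/model.safetensors
get_cached_model_path("~/.cache/huggingface/hub", "bert-base-uncased")
## returns the snapshots/<revision> directory containing the weight file
```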


#### BERT ####


@@ -269,9 +289,9 @@ BERT_download = function(models=NULL) {
gc()
})
}
-cache.folder = str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/")
+cache.folder = get_cache_folder(transformers)
cache.sizegb = sum(file.size(list.files(cache.folder, recursive=TRUE, full.names=TRUE))) / 1024^3
-local.models = find_cached_models(cache.folder)
+local.models = get_cached_models(cache.folder)
cli::cli_h2("Downloaded models:")
print(local.models)
cat("\n")
@@ -305,8 +325,8 @@
#' @export
BERT_info = function(models=NULL) {
transformers = transformers_init(print.info=FALSE)
-cache.folder = str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/")
-local.models = find_cached_models(cache.folder)
+cache.folder = get_cache_folder(transformers)
+local.models = get_cached_models(cache.folder)
dm = data.table(model=row.names(local.models), size=local.models$size)
model = NULL
if(!is.null(models)) {
@@ -318,8 +338,9 @@
}
dm$size = str_remove(dm$size, " ")
dm = cbind(dm, rbindlist(lapply(dm$model, function(model) {
-tokenizer = transformers$AutoTokenizer$from_pretrained(model, local_files_only=TRUE)
-model.obj = transformers$AutoModel$from_pretrained(model, local_files_only=TRUE)
+model.local = get_cached_model_path(cache.folder, model)
+tokenizer = transformers$AutoTokenizer$from_pretrained(model.local, local_files_only=TRUE)
+model.obj = transformers$AutoModel$from_pretrained(model.local, local_files_only=TRUE)
word.embeddings = model.obj$embeddings$word_embeddings$weight$data$shape
data.table(vocab = word.embeddings[0],
dims = word.embeddings[1],
@@ -430,7 +451,7 @@ BERT_vocab = function(
#' @export
FMAT_load = function(models) {
transformers = transformers_init()
-cache.folder = str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/")
+cache.folder = get_cache_folder(transformers)
cli::cli_text("Loading models from {.path {cache.folder}} ...")
fms = lapply(models, function(model) {
t0 = Sys.time()
@@ -461,9 +482,9 @@ fix_pair = function(X, var="MASK") {


# query = "[MASK] is ABC."
-# expand_pair(query, .(High=cc("high, strong"), Low=cc("low, weak")))
+# expand_pair(query, .(High=c("high", "strong"), Low=c("low", "weak")))
# expand_pair(query, .(H="high", M="medium", L="low"))
-# X = .(Flower=cc("rose, iris, lily"), Pos=cc("health, happiness, love, peace"))
+# X = .(Flower=c("rose", "iris", "lily"), Pos=c("health", "happiness", "love", "peace"))
# expand_full(query, X)


@@ -556,31 +577,22 @@ append_X = function(dq, X, var="TARGET") {
#' [`FMAT_run`]
#'
#' @examples
#' FMAT_query("[MASK] is a nurse.", MASK = .(Male="He", Female="She"))
#' \donttest{FMAT_query("[MASK] is a nurse.", MASK = .(Male="He", Female="She"))
#'
#' FMAT_query(
#' c("[MASK] is {TARGET}.", "[MASK] works as {TARGET}."),
#' MASK = .(Male="He", Female="She"),
-#' TARGET = .(Occupation=cc("a doctor, a nurse, an artist"))
+#' TARGET = .(Occupation=c("a doctor", "a nurse", "an artist"))
#' )
#'
#' FMAT_query(
#' "The [MASK] {ATTRIB}.",
-#' MASK = .(Male=cc("man, boy"),
-#' Female=cc("woman, girl")),
-#' ATTRIB = .(Masc=cc("is masculine, has a masculine personality"),
-#' Femi=cc("is feminine, has a feminine personality"))
+#' MASK = .(Male=c("man", "boy"),
+#' Female=c("woman", "girl")),
+#' ATTRIB = .(Masc=c("is masculine", "has a masculine personality"),
+#' Femi=c("is feminine", "has a feminine personality"))
#' )
#'
-#' FMAT_query(
-#' "The association between {TARGET} and {ATTRIB} is [MASK].",
-#' MASK = .(H="strong", L="weak"),
-#' TARGET = .(Flower=cc("rose, iris, lily"),
-#' Insect=cc("ant, cockroach, spider")),
-#' ATTRIB = .(Pos=cc("health, happiness, love, peace"),
-#' Neg=cc("death, sickness, hatred, disaster"))
-#' )
-#'
+#' }
#' @export
FMAT_query = function(
query = "Text with [MASK], optionally with {TARGET} and/or {ATTRIB}.",
@@ -658,19 +670,19 @@ FMAT_query = function(
#' [`FMAT_run`]
#'
#' @examples
-#' FMAT_query_bind(
+#' \donttest{FMAT_query_bind(
#' FMAT_query(
#' "[MASK] is {TARGET}.",
#' MASK = .(Male="He", Female="She"),
-#' TARGET = .(Occupation=cc("a doctor, a nurse, an artist"))
+#' TARGET = .(Occupation=c("a doctor", "a nurse", "an artist"))
#' ),
#' FMAT_query(
#' "[MASK] occupation is {TARGET}.",
#' MASK = .(Male="His", Female="Her"),
-#' TARGET = .(Occupation=cc("doctor, nurse, artist"))
+#' TARGET = .(Occupation=c("doctor", "nurse", "artist"))
#' )
#' )
-#'
+#' }
#' @export
FMAT_query_bind = function(...) {
types = sapply(list(...), attr, "type")
@@ -791,22 +803,21 @@
#' query1 = FMAT_query(
#' c("[MASK] is {TARGET}.", "[MASK] works as {TARGET}."),
#' MASK = .(Male="He", Female="She"),
-#' TARGET = .(Occupation=cc("a doctor, a nurse, an artist"))
+#' TARGET = .(Occupation=c("a doctor", "a nurse", "an artist"))
#' )
#' data1 = FMAT_run(models, query1)
#' summary(data1, target.pair=FALSE)
#'
#' query2 = FMAT_query(
#' "The [MASK] {ATTRIB}.",
-#' MASK = .(Male=cc("man, boy"),
-#' Female=cc("woman, girl")),
-#' ATTRIB = .(Masc=cc("is masculine, has a masculine personality"),
-#' Femi=cc("is feminine, has a feminine personality"))
+#' MASK = .(Male=c("man", "boy"),
+#' Female=c("woman", "girl")),
+#' ATTRIB = .(Masc=c("is masculine", "has a masculine personality"),
+#' Femi=c("is feminine", "has a feminine personality"))
#' )
#' data2 = FMAT_run(models, query2)
#' summary(data2, mask.pair=FALSE)
#' summary(data2)
-#'
#' }
#'
#' @export
@@ -833,7 +844,7 @@ FMAT_run = function(
rather than the returned object from `FMAT_load()`.", call.=FALSE)
} else {
transformers = transformers_init()
-cache.folder = str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/")
+cache.folder = get_cache_folder(transformers)
cli::cli_text("Loading models from {.path {cache.folder}} ...")
cat("\n")
}
32 changes: 23 additions & 9 deletions README.md
@@ -33,7 +33,7 @@ Han-Wu-Shuang (Bruce) Bao 包寒吴霜

## Installation

-To use the FMAT, the R package `FMAT` and two Python packages (`transformers` and `torch`) all need to be installed.
+To use the FMAT, the R package `FMAT` and three Python packages (`transformers`, `torch`, and `huggingface-hub`) all need to be installed.

### (1) R Package

@@ -48,22 +48,36 @@ devtools::install_github("psychbruce/FMAT", force=TRUE)

### (2) Python Environment and Packages

-Install [Anaconda](https://www.anaconda.com/download) (a recommended package manager which automatically installs Python, Python IDEs like Spyder, and a large list of necessary [Python package dependencies](https://docs.anaconda.com/free/anaconda/pkg-docs/)).
+Install [Anaconda](https://www.anaconda.com/download/success) (a recommended package manager which automatically installs Python, Python IDEs like Spyder, and a large list of necessary [Python package dependencies](https://docs.anaconda.com/free/anaconda/allpkglists/)).

-Specify the Python interpreter in RStudio.
+Specify Anaconda's Python interpreter in RStudio.

> RStudio → Tools → Global/Project Options\
> → Python → Select → **Conda Environments**\
> → Choose **".../Anaconda3/python.exe"**
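
Alternatively, the interpreter can be selected from R via reticulate (a sketch, assuming the default `base` conda environment):

```
reticulate::use_condaenv("base", required=TRUE)
reticulate::py_config()  # check which Python is in use
```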
Install the "[transformers](https://huggingface.co/docs/transformers/installation)" and "[torch](https://pytorch.org/get-started/locally/)" Python packages.\
(Windows Command / Anaconda Prompt / RStudio Terminal)
Install specific versions of Python packages "[transformers](https://pypi.org/project/transformers/#history)", "[torch](https://pypi.org/project/torch/#history)", and "[huggingface-hub](https://pypi.org/project/huggingface-hub/#history)".\
(RStudio Terminal / Anaconda Prompt / Windows Command)

+For CPU users:
+
+```
+pip install transformers==4.40.2 torch==2.2.1 huggingface-hub==0.20.3
+```
+
+For GPU (CUDA) users:
+
```
-pip install transformers torch
+pip install transformers==4.40.2 huggingface-hub==0.20.3
+pip install torch==2.2.1 --index-url https://download.pytorch.org/whl/cu121
```

-See [Guidance for GPU Acceleration] for installation guidance if you have an NVIDIA GPU device on your PC and want to use GPU to accelerate the pipeline.
+- See [Guidance for GPU Acceleration] for installation guidance if you have an NVIDIA GPU device on your PC and want to use GPU to accelerate the pipeline.
+- According to the May 2024 releases, "transformers" ≥ 4.41 depends on "huggingface-hub" ≥ 0.23. The suggested versions of "transformers" (4.40.2) and "huggingface-hub" (0.20.3) ensure the console display of progress bars when downloading BERT models while keeping these packages as new as possible (a version check is sketched after this list).
+- Proxy users should use the "global mode" to download models.
+- If you see the error `HTTPSConnectionPool(host='huggingface.co', port=443)`, try to (1) reinstall [Anaconda](https://www.anaconda.com/download/success), which may resolve some unknown issues, or (2) downgrade the "[urllib3](https://pypi.org/project/urllib3/)" package to version ≤ 1.25.11 (`pip install urllib3==1.25.11`) so that it uses HTTP proxies (rather than HTTPS proxies, as in later versions) to connect to Hugging Face.
+  - <https://www.cnblogs.com/devilmaycry812839668/p/17872452.html>
+  - <https://zhuanlan.zhihu.com/p/350015032>
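
One way to verify the pinned versions after installation (any of the terminals listed above):

```
pip show transformers torch huggingface-hub
```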

## Guidance for FMAT

@@ -112,10 +126,10 @@ Checklist:
- You may also install the corresponding version of CUDA Toolkit (e.g., for the `torch` version supporting CUDA 12.1, the same version of [CUDA Toolkit 12.1](https://developer.nvidia.com/cuda-12-1-0-download-archive) may also be installed).

Example code for installing PyTorch with CUDA support:\
-(Windows Command / Anaconda Prompt / RStudio Terminal)
+(RStudio Terminal / Anaconda Prompt / Windows Command)

```
-pip install torch --index-url https://download.pytorch.org/whl/cu121
+pip install torch==2.2.1 --index-url https://download.pytorch.org/whl/cu121
```
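
After installation, one way to confirm from R that PyTorch detects the GPU, using the same reticulate calls the package relies on:

```
torch = reticulate::import("torch")
torch$cuda$is_available()     # should be TRUE
torch$cuda$get_device_name()  # e.g., "NVIDIA GeForce RTX ..."
```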

## BERT Models