Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Container and Block for Text #207

Merged
merged 25 commits into from
May 12, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
a8d0f52
Add basic Text module and sample recipe.
Chandu-4444 Mar 26, 2022
6ea531c
Add docstrings
Chandu-4444 Mar 26, 2022
c533ce3
Start adding text transforms
Chandu-4444 Mar 30, 2022
50cfd2e
Remove basic preprocessing functions
Chandu-4444 Mar 30, 2022
e51b866
Add xxbos transform and minor updates
Chandu-4444 Mar 30, 2022
15f710c
Update src/Text/Text.jl
Chandu-4444 Mar 31, 2022
2d5e4ba
Update src/Text/recipes.jl
Chandu-4444 Mar 31, 2022
d44e1b0
Update TextBlock documentation.
Chandu-4444 Mar 31, 2022
e714966
Update declaration of `checkblock` method
Chandu-4444 Apr 1, 2022
357eaf0
Update Text.jl to remove an unexpected error
Chandu-4444 Apr 1, 2022
cbebcb6
Update src/Text/recipes.jl
Chandu-4444 Apr 1, 2022
a18d72d
Update src/Text/transform.jl
Chandu-4444 Apr 1, 2022
1a1266a
Update src/datasets/containers.jl
Chandu-4444 Apr 1, 2022
d94c94e
Update `recipes.jl` with suggestions provided
Chandu-4444 Apr 1, 2022
ce50151
Change `TextBlock` to more reasonable `Paragraph`
Chandu-4444 Apr 4, 2022
db511fb
Change `Text` to `Textual` to resolve conflict
Chandu-4444 Apr 4, 2022
afc479f
Remove type annotations for text transforms
Chandu-4444 Apr 4, 2022
5a779cf
Add mockblock for text
Chandu-4444 Apr 16, 2022
98df8ec
Merge branch 'FluxML:master' into master
Chandu-4444 Apr 20, 2022
b6224b4
Add simple test for `TextFolders`
Chandu-4444 Apr 20, 2022
14a96d0
Remove tests
Chandu-4444 Apr 21, 2022
cb561b7
Add test (again).
Chandu-4444 Apr 21, 2022
8a17345
Update src/Textual/blocks/text.jl
Chandu-4444 Apr 23, 2022
70405ac
Update test for `TextFolders`
Chandu-4444 Apr 23, 2022
290ae32
Merge branch 'FluxML:master' into master
Chandu-4444 May 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/FastAI.jl
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ export Vision
include("Tabular/Tabular.jl")
@reexport using .Tabular

include("Text/Text.jl")
@reexport using .Text

include("deprecations.jl")
export
Expand Down Expand Up @@ -173,6 +175,7 @@ export
TableRow,
Continuous,
Image,
TextBlock,

# encodings
encode,
Expand Down
35 changes: 35 additions & 0 deletions src/Text/Text.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""
    Text

Submodule collecting the text-related parts of FastAI: the `TextBlock` block, the
`TextFolders` dataset recipe, and the string transforms exported below.
"""
module Text

using ..FastAI
using ..FastAI:
    # blocks
    AbstractBlock, Block, WrapperBlock, Label, LabelMulti, OneHotTensor,
    OneHotTensorMulti, Continuous, wrapped, getblocks, getencodings, encodeinput,
    encodetarget,
    # encodings
    Encoding, StatefulEncoding, OneHot,
    # visualization
    ShowText,
    # other
    Context, Training, Validation, FASTAI_METHOD_REGISTRY, registerlearningtask!

using Requires: @require

export TextBlock, TextFolders
export replace_all_caps, replace_sentence_case, convert_lowercase

include("recipes.jl")
include("blocks/text.jl")
include("transform.jl")

# Runs when the module is loaded: registers the dataset recipes, and defers loading
# of the Makie-specific code until Makie itself has been loaded.
function __init__()
    _registerrecipes()
    @require Makie="ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" begin
        import .Makie
        import .Makie: @recipe, @lift
        import .FastAI: ShowMakie
        include("makie.jl")
    end
end

end

1 change: 1 addition & 0 deletions src/Text/blocks/text.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""
    TextBlock()

Field-less `Block` subtype. `TextFolders` pairs it with a `Label` block to describe
the samples it loads.
"""
struct TextBlock <: Block
end
Chandu-4444 marked this conversation as resolved.
Show resolved Hide resolved
1 change: 1 addition & 0 deletions src/Text/makie.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# No Makie recipes yet; plain-text display is probably a better fit for text data.
45 changes: 45 additions & 0 deletions src/Text/recipes.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
TextFolders(textfile; labelfn = parentname, split = false)

Recipe for loading a single-label text classification dataset
stored in hierarchical folder format.
"""

Chandu-4444 marked this conversation as resolved.
Show resolved Hide resolved
Base.@kwdef struct TextFolders <: Datasets.DatasetRecipe
labelfn = parentname
split::Bool = false
filefilterfn = _ -> true
end

# Block types a `TextFolders` recipe yields: a `TextBlock` sample with a `Label` target.
function Datasets.recipeblocks(::Type{TextFolders})
    return Tuple{TextBlock, Label}
end

"""
    Datasets.loadrecipe(recipe::TextFolders, path)

Load the text classification dataset stored under the directory `path`.

Files are collected with `loadfolderdata`, keeping only those that satisfy both
`istextfile` and `recipe.filefilterfn`. Each kept file is loaded with `loadfile` and
labelled with `recipe.labelfn`. When `recipe.split` is `true`, `grandparentname` is
used as the split function.

Returns `(data, blocks)` where `blocks` is `(TextBlock(), Label(classes))` and
`classes` are the unique labels found.

Throws an error if `path` is not a directory, if no text files are found, or if
fewer than two distinct labels are present.
"""
function Datasets.loadrecipe(recipe::TextFolders, path)
    isdir(path) || error("$path is not a directory")
    data = loadfolderdata(
        path,
        filterfn = f -> istextfile(f) && recipe.filefilterfn(f),
        loadfn = (loadfile, recipe.labelfn),
        splitfn = recipe.split ? grandparentname : nothing)

    # With `split = true` the result is a collection of per-split containers, so
    # emptiness is checked with `length`; otherwise it is a single data container.
    (recipe.split ? length(data) > 0 : nobs(data) > 0) || error("No text files found in $path")

    # NOTE(review): with `split = true` the classes are taken from the first split
    # only — confirm every split is expected to contain all labels.
    labels = recipe.split ? first(values(data))[2] : data[2]
    blocks = TextBlock(), Label(unique(eachobs(labels)))
    length(blocks[2].classes) > 1 || error("Expected multiple different labels, got: $(blocks[2].classes)")
    return data, blocks
end

# Registering recipes

# Recipes to register, keyed by dataset name. For "imdb", files whose path contains
# "tmp_clas", "tmp_lm" or "unsup" are excluded.
const RECIPES = Dict{String,Vector{Datasets.DatasetRecipe}}(
    "imdb" => [
        TextFolders(
            filefilterfn = f -> !any(s -> contains(f, s), ("tmp_clas", "tmp_lm", "unsup")),
        ),
    ],
)

# Register every recipe listed in `RECIPES` under its dataset name in
# `Datasets.FASTAI_DATA_REGISTRY`.
function _registerrecipes()
    for (name, recipes) in pairs(RECIPES)
        for recipe in recipes
            Datasets.registerrecipe!(Datasets.FASTAI_DATA_REGISTRY, name, recipe)
        end
    end
end

25 changes: 25 additions & 0 deletions src/Text/transform.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
replace_all_caps(String)

Replace tokens in ALL CAPS by their lower version and add xxup before.
"""

function replace_all_caps(t::String)
t = replace(t, r"([A-Z]+[^a-z\s]*)(?=(\s|$))" => s"xxup \1")
t = replace(t, r"([A-Z]*[^a-z\s]+)(?=(\s|$))" => lowercase)
end

"""
replace_sentence_case(String)

Replace tokens in Sentence Case by their lower verions and add xxmaj before.
"""

function replace_sentence_case(t::String)
t = replace(t, r"(?<!\w)([A-Z][A-Z0-9]*[a-z0-9]+)(?!\w)" => s"xxmaj \1")
t = replace(t, r"(?<!\w)([A-Z][A-Z0-9]*[a-z0-9]+)(?!\w)" => lowercase)
end

"""
    convert_lowercase(t::AbstractString)

Lowercase `t` and prepend the marker `xxbos ` to it. Returns a new `String`.

# Examples
```julia
julia> convert_lowercase("Hello World")
"xxbos hello world"
```
"""
function convert_lowercase(t::AbstractString)
    return string("xxbos ", lowercase(t))
end
Chandu-4444 marked this conversation as resolved.
Show resolved Hide resolved
1 change: 1 addition & 0 deletions src/datasets/Datasets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ export

# utilities
isimagefile,
istextfile,
matches,
loadfile,
loadmask,
Expand Down
2 changes: 2 additions & 0 deletions src/datasets/containers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ function loadfile(file::String)
return FileIO.load(file, view = true)
elseif endswith(file, ".csv")
return DataFrame(CSV.File(file))
elseif endswith(file, ".txt")
return String(read(file))
Chandu-4444 marked this conversation as resolved.
Show resolved Hide resolved
else
return FileIO.load(file)
end
Expand Down
2 changes: 2 additions & 0 deletions src/datasets/load.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ matches(re::Regex) = f -> matches(re, f)
# `true` when `re` matches somewhere in `f`.
function matches(re::Regex, f)
    return match(re, f) !== nothing
end

# Case-insensitive patterns that recognise a file by its extension.
const RE_IMAGEFILE = r".*\.(gif|jpe?g|tiff?|png|webp|bmp)$"i
const RE_TEXTFILE = r".*\.(txt|csv|json|md|html?|xml|yaml|toml)$"i

# Whether `f` ends in one of the image extensions listed in `RE_IMAGEFILE`.
function isimagefile(f)
    return matches(RE_IMAGEFILE, f)
end

# Whether `f` ends in one of the text extensions listed in `RE_TEXTFILE`.
function istextfile(f)
    return matches(RE_TEXTFILE, f)
end


# For arrays of `Gray{T}` values: reinterpret the elements as plain `T` and delegate
# to the `maskfromimage` method that handles the reinterpreted array.
function maskfromimage(a::AbstractArray{<:Gray{T}}, classes) where {T}
    return maskfromimage(reinterpret(T, a), classes)
end
Expand Down