Skip to content

Commit

Permalink
cleaned
Browse files Browse the repository at this point in the history
  • Loading branch information
behrica committed Nov 2, 2024
1 parent c75e86b commit 6061243
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 17 deletions.
3 changes: 2 additions & 1 deletion exp/exp.clj
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
[tech.v3.dataset.column-filters :as cf])
(:import [java.util.zip GZIPInputStream]
[ml.dmlc.xgboost4j.java XGBoost]))
(def max-lines 10000) ; fails with 10000
(def max-lines 1000) ; kept at 1000 because a run with 10000 fails

(defn deterministic-shuffle
[^java.util.Collection coll seed]
Expand All @@ -31,6 +31,7 @@
#(str/split % #" ")
:max-lines max-lines
:skip-lines 1
:datatype-document :int32
:datatype-token-idx :int32)
:datasets
first
Expand Down
1 change: 0 additions & 1 deletion src/scicloj/ml/xgboost.clj
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,6 @@ subsample may be set to as low as 0.1 without loss of model accuracy. Note that
(tc/select-columns [:document :token-idx text-feature-column])
(tc/rows))

;n-col (inc (apply max (bow-zeroed :token-idx)))

;_ (println :n-col n-col)

Expand Down
30 changes: 17 additions & 13 deletions src/scicloj/ml/xgboost/csr.clj
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,25 @@
:row-pointers new-row-pointers})))

;; Builds a map with the keys :values, :column-indices and :row-pointers
;; from `r-c-vs`, a collection of [row col value] triples.
;; The triples are sorted by row, then by column, and handed one by one to
;; `add-to-csr`; afterwards the total number of stored values is appended
;; to :row-pointers.
;;
;; NOTE(review): this span contains two bodies for the same pipeline (a `->`
;; form and a `let`/`->>` form), and the closing parens of the first form
;; already close the defn. It looks like removed and added diff lines
;; rendered together -- confirm against the repository before relying on it.
(defn ->csr [r-c-vs]
;; debug output: number of triples received
(println :->csr :count (count r-c-vs))
;; The data gets sorted by row and column.
;; Not sure whether sorting is a good idea for performance.

;; `->` variant: reduce over the sorted triples, then append the final
;; row pointer.
(->
(reduce
(fn [csr [row col value]]
(add-to-csr csr row col value))
;; initial accumulator; presumably :row-pointers starts out holding a
;; single 0 -- TODO confirm against dt/make-list
{:values (dt/make-list :float)
:column-indices (dt/make-list :int)
:row-pointers (dt/make-list :long [0])}
(sort-by (juxt first second)
r-c-vs))

;; append the number of collected values to :row-pointers
(#(assoc % :row-pointers (conj (:row-pointers %)
(count (:values %)))))))

;; `->>` variant of the same pipeline, bound to a local so the final
;; `assoc` can refer to the reduced map by name.
(let [ r-c-v-maps
(->> r-c-vs
;; pass-through step: prints :sort and returns the collection unchanged
( (fn [it] (println :sort) it))
(sort-by (juxt first second))
;; pass-through step: prints :reduce and returns the collection unchanged
( (fn [it] (println :reduce) it))
(reduce
(fn [csr [row col value]]
(add-to-csr csr row col value))
;; initial accumulator; presumably :row-pointers starts out holding a
;; single 0 -- TODO confirm against dt/make-list
{:values (dt/make-list :float)
:column-indices (dt/make-list :int)
:row-pointers (dt/make-list :long [0])}))]

;; append the number of collected values to :row-pointers
(assoc r-c-v-maps :row-pointers
(conj (:row-pointers r-c-v-maps)
(count (:values r-c-v-maps))))))


(defn- first-non-nil-or-0 [s]
Expand Down
7 changes: 5 additions & 2 deletions test/scicloj/ml/text_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,15 @@
)


n-sparse-columns (inc (apply max (ds :token-idx)))
m-train (xgboost/tidy-text-bow-ds->dmatrix (cf/feature bow-train)
(tc/select-columns bow-train [:label])
:tfidf)
:tfidf
n-sparse-columns)
m-test (xgboost/tidy-text-bow-ds->dmatrix (cf/feature bow-test)
(tc/select-columns bow-test [:label])
:tfidf)
:tfidf
n-sparse-columns)

model
(xgboost/train-from-dmatrix
Expand Down

0 comments on commit 6061243

Please sign in to comment.