; DT-LEARN, DT-PERFORM ********************************************************
;
; DT-LEARN implements a decision tree learner based on ID3 with a few
; extensions from those parts of C4.5 relevant to categorical data.
; It generates code for a function that runs the tree, and writes it to
; a file. User can then load the file and use the tree by calling the
; function DT-PERFORM with one feature vector:
;
; (dt-perform vector)
;
; Call to dt-learn is:
;
; (dt-learn (list "file1" "file2" ...) excluded-words)
;
; Arguments "file1", "file2", etc., are paths, in the form of quoted strings,
; of files containing the dataset to be used.
;
; The dataset consists of one file per class, with the class name on the
; first line, followed by examples in that class, one per line.
;
; Examples are assumed to be in the form of a sequence of words -- a sentence --
; where the words are separated by a space.
;
; Excluded-words is a list of words that should be ignored.
;
; After the excluded words list, several keyword arguments can be included
;
; :min-samples <integer>
; If fewer than this many samples remain at a node, stop and make this a
; leaf. Default is 5. NOTE: May want to change this such that if some
; branches at a node contain less than min-samples samples, even though the
; node as a whole has > min-samples, the underpopulated branches are grouped.
; Make groups of no more than necessary to get above min-samples, and choose
; combinations that have similar main categories. (There will be few enough
; that grouping them will not be too time-consuming). If class-if-below-min
; is not 'vote, this grouping is moot.
;
; :max-depth <integer or NIL>
; If not NIL, quit at this many features in a conjunct (i.e. paths are
; limited to this many decision nodes). Default is NIL (i.e. don't constrain
; depth).
;
; :class-if-below-min <class or 'vote>
; If a subtree is truncated due to having fewer samples than the :min-samples
; cutoff or reaching the :max-depth cutoff, substitute this class. If the
; special name 'vote is used, take the majority class of the samples remaining
; at this node. Default is 'vote.
;
; :output-file <string>
; Name of the file to which the dt-perform code should be written.
; Default is "dt-perform.lsp".
;
; For instance, a call to dt-learn might look like:
;
; (dt-learn (list "target1.data" "target2.data" "target3.data")
; (list "a" "an" "the") :min-samples 0 :max-depth NIL)
;
; There is no return value. Instead, dt-learn produces code for function
; DT-PERFORM, and writes it to a file. Load this file, then call dt-perform
; with a single vector whose form is the same as an example in the original
; data -- a list of words -- with no class attached. To run dt-perform:
;
; (load "dt-perform.lsp")
;
; (dt-perform <sentence>)
;
; where <sentence> is a list of words, e.g. (list "this" "is" "a" "sentence").
;
; Return value is a class.
; INPUT and associated TEXT MANIPULATION **************************************
; Input and text manipulation functions are in text.lsp -- get it.
(load "text.lsp");
; STRING HELPERS for OUTPUT ***************************************************
;
; Since we're producing code as output, we do a fair amount of string
; manipulation. In particular, we often need to make quoted strings.
; Make a quoted string out of the supplied text (itself a string).
(defun make-quoted-string (text)
(concatenate 'string
(princ-to-string #\")
text
(princ-to-string #\")))
; Helper for making strings: Given a list of strings, make a text string that's
; a Lisp-style list containing those strings.
(defun make-string-of-list (list-of-strings)
(concatenate 'string
; Start the list.
"(list "
; Insert strings, quoted and space-separated, as list contents.
(reduce
; Join new string to what we have so far.
#'(lambda (prev-string new-string)
(concatenate 'string
prev-string
" "
(make-quoted-string new-string)))
; Get strings out of the supplied list.
list-of-strings
; Start with an empty string so the first item gets processed as the
; "new-string" argument to the above function, and gets quoted.
:initial-value "")
; Close the list.
")"))
; Write out a string without escaped characters. This avoids getting strings
; with \" instead of ". Use "~A" not "~S" -- "~A" uses princ, which does not
; add escapes, whereas "~S" uses prin1, which does. (:if-exists :supersede
; lets the file be rewritten on a re-run instead of signaling an error.)
(defun write-without-escaped-chars (text output-file)
  (with-open-file (output-stream output-file :direction :output
                                             :if-exists :supersede)
    (format output-stream "~A" text)))
; BIT OPERATIONS **************************************************************
;
; Because features are binary (presence or absence of words and word pairs),
; feature vectors can be represented compactly as integers. A few convenience
; functions will be useful.
; Make an integer with a one bit at the specified position. (Note that
; (byte 1 index) would not work here: BYTE returns a byte specifier for use
; with LDB/DPB, not an integer mask.)
(defun makebit (index)
  (ash 1 index))
; Set the bit at the given index in the given number. To change the bit in
; place (i.e. destructively), setf this back into the original place.
(defun setbit (index number)
  (logior number (ash 1 index))) ; or could do (dpb 1 (byte 1 index) number)
                                 ; but logior may be faster
; Turn off a bit.
(defun clrbit (index number)
  (logandc2 number (ash 1 index)))
; To test whether a particular bit is on, do (logbitp index number).
; Compare a particular bit in a number to a supplied bit value.
(defun cmpbit (index number value)
(= value (ldb (byte 1 index) number)))
; Make sure a value is represented by 1 and 0 -- if needed, convert T and NIL
; to 1 and 0.
(defun bitvalue (value)
(if (or (null value) (equal value 0)) 0 1))
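; Illustrative REPL checks for the bit helpers (expected values shown as
; comments):
;
;   (makebit 3)       ; => 8
;   (setbit 3 0)      ; => 8
;   (clrbit 3 15)     ; => 7
;   (logbitp 3 8)     ; => T
;   (cmpbit 3 8 1)    ; => T
;   (bitvalue T)      ; => 1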
; DECISION TREE GENERATION ****************************************************
;
; Once we have the file read in, we fall into a tree traversal -- DFS, because
; the intent is to produce Lisp code for the conditional expression
; representing the tree, which will have nested expressions -- a natural fit
; for DFS. This conditional can then be wrapped in a defun and saved in a file
; for future use. I'll store it as a string, not a list, so I can write it out
; as a file without having to fight with the Lisp printer.
;
; Remember, in the following, that we are *writing code* to process *one* (real)
; sentence (not training data). (We process the training data right here.)
;
; We don't start with feature vectors in the traditional sense -- not a list of
; feature values -- so our first step is to create them. Even though there may
; be roughly a thousand features (e.g. for a lexicon of size 30 there are 930),
; it is still less work to test for features up front, because otherwise, when
; filtering down through the levels while generating the tree, we'd be doing a
; test for all but the previously-selected features at each tree level on every
; training sentence.
;
; On the other hand, we will not extract a feature vector for sentences
; *processed by* the tree while running, because we'll only be testing each
; feature once, and only a small fraction of the possible features.
;
; For our word or word pair features, we only have "present" or "absent" as
; values, i.e. all are binary. We use true to represent presence and NIL for
; absence -- this is what the feature predicate, contains-in-order, returns.
;
; At this point, we have for each class a collection of binary feature vectors.
;
; At any node, we test one feature, and split into branches for each of its
; values (there are no missing features, else we'd have a branch for that).
; This corresponds to an "if" statement.
;
; When we follow a branch, we select out only those sentences that have the
; feature value corresponding to that branch -- others are dropped. So at any
; node, we have data in exactly the same form as at the root -- just less of it.
;
; When we're creating the code for any node, our caller will have two string
; fragments that belong before and after our code. We'll just generate a
; string for this node as though it were the root, and pass it back. The
; caller will combine the results.
;
; The code to make one node does this: First we decide if it's an end node
; (i.e. if all remaining vectors belong to a single class, or we've dropped
; below min-samples, or the level has reached max-depth). If it is, we return
; a string containing the appropriate class, which depends on the user's
; settings -- it's either the most common class, or the one they specify.
;
; If it's not an end node, we calculate the change in information for each
; (remaining) feature.
;
; (Note re. information gain: ID3 compares the information gain, which is the
; current node's information minus that of all successors when split on a
; feature. However, there is no need to calculate the information in the
; current node, because that's the same for all features -- we just need to
; remember that minus sign in front of the sum of information over the values.)
;
; Pick the best feature and make a test for it. This yields two branches, so
; make it an "if" statement.
;
; In the place where the result value of each "if" branch goes, make a
; recursive call with the subset of vectors belonging to this branch, and with
; the current feature filtered out. Insert the result into the "if" and
; close off this "if" option.
;
; Continue for the remaining branches, finish off the "if", and write it out.
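;
; For concreteness, here is the shape of the code this process emits. This is
; a hypothetical two-feature tree -- the actual features, classes, and nesting
; depend entirely on the training data:
;
; (defun dt-perform (sentence)
;   (if (contains-in-order sentence (list "not" "good"))
;     "negative"
;     (if (contains-in-order sentence (list "great"))
;       "positive"
;       "neutral")))
;
; (compile 'dt-perform)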
; HELPERS for DECISION TREE GENERATION ****************************************
; Count the total number of vectors in vector-lists.
(defun total-vectors (vector-lists)
(reduce #'+ (mapcar #'length vector-lists)))
; Helper function that makes a list of integers from 0 to k-1 given k.
; (Numerical indices into the features are used frequently. This list assists
; in running mapcar over them.)
(defun make-index-list (num)
; Count backwards from k-1 and cons the numbers onto the front of a list.
(do
((index (1- num) (1- index))
(index-list NIL (cons index index-list)))
((= index -1) index-list)))
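; For example (illustrative):
;   (make-index-list 4)   ; => (0 1 2 3)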
; Helper that locates the class with the largest number of examples, and
; gets the corresponding class name.
(defun find-biggest-class (classes vector-lists)
(let (; Step through vector lists; get length; compare with those already
; examined -- if bigger, get the index of the current list, else keep
; the old index. (A triple of values is used as an iterator: index of
; biggest class so far, its size, and current class index. This could
; be done with mapc and side effects, as in pick-best below. Which is
; uglier? Dunno, but at least this one has no setfs... Maybe I'll
; rewrite it with do*.)
(index
(first (reduce
; Compare current vector list with previous biggest. If
; same size, stick with the previous one.
#'(lambda (bestindex-bestsize-curindex vector-list)
(let ((bestindex (first bestindex-bestsize-curindex))
(bestsize (second bestindex-bestsize-curindex))
(curindex (third bestindex-bestsize-curindex))
(cursize (length vector-list)))
(if (> cursize bestsize)
; Found a bigger one. Make a new triple with its
; index and size, and increment the current index.
(list curindex cursize (1+ curindex))
; Keep the previous best, but bump the current index.
(list bestindex bestsize (1+ curindex)))))
; Vector lists
vector-lists
; Start with best size = -1 so it'll be replaced no matter
; what. Start with curindex = 0 because on each pass,
; curindex is the index of the vector list we're examining.
:initial-value (list -1 -1 0)))))
; Get the class name corresponding to the biggest vector list.
; Wrap it in quotes.
(make-quoted-string (nth index classes))))
; Given a vector, check if it has the specified value for the specified
; feature. (Recall feature values are bits in integers, with 1 representing
; true, and 0 representing NIL.) Return the vector if the value matches,
; else NIL.
(defun feature-has-value (vector index value)
(when (cmpbit index vector (bitvalue value)) vector))
; Given a vector list, apply feature-has-value to produce a list of vectors
; and NILs: the vectors are those that had the right value, and the NILs stand
; in for the vectors that didn't. Then remove the NILs.
(defun filter-one-list (vector-list index value)
(remove-if #'null
(mapcar #'(lambda (vector) (feature-has-value vector index value))
vector-list)))
; Given a feature index and a value, along with vector-lists, filter out
; vectors that don't have that value for the specified feature. Return a
; reduced vector-lists.
(defun filter-on-value (vector-lists index value)
; Remove vectors that don't have this value.
(mapcar
#'(lambda (vector-list) (filter-one-list vector-list index value))
vector-lists))
; For each value (T or NIL) of the given feature (denoted by its index), make
; a new vector-lists containing only the vectors with that value. Return the
; pair of vector-lists in a list with the T vector-lists first.
;
; If the feature is selected, these vector-lists will be used by the node
; generator to make nodes for each value.
(defun split-on-feature (vector-lists index)
; Split up vector-lists by starting with the whole thing for each value,
; then filtering out vectors that don't have that value. Resulting split
; vector-lists are in order with the list for value T first, then for NIL.
(mapcar #'(lambda (value)
(filter-on-value vector-lists index value)) '(T NIL)))
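; A small illustration (vectors written in binary for readability): with two
; classes and feature index 0,
;
;   (split-on-feature (list (list #b01 #b11) (list #b00)) 0)
;     => (((1 3) NIL) (NIL (0)))
;
; i.e. the T branch keeps the two class-1 vectors whose bit 0 is set, and the
; NIL branch keeps the lone class-2 vector whose bit 0 is clear.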
; Helper for use when we're in an end case, and need to decide whether to vote
; or use the supplied class-if-below-min value.
(defun vote-or-use-supplied-class (classes vector-lists class-if-below-min)
; If the user wants to vote, find the most populous class, else use their
; supplied class.
(if (equal class-if-below-min 'vote)
; Here if we want to vote.
(find-biggest-class classes vector-lists)
; Here to use the fixed class -- make a quoted string.
(make-quoted-string class-if-below-min)))
; Test for end case and return NIL if not an end case, or a string with the
; chosen class. We are at an end case if: 1) there are fewer than min-samples
; vectors total, 2) all samples have the same class, or 3) the tree level is
; deeper than max-depth.
(defun end-case (classes vector-lists
min-samples class-if-below-min max-depth depth)
; See if we've fallen below min-samples.
(if (< (total-vectors vector-lists) min-samples)
; Here, there are too few vectors to continue. Depending on
; class-if-below-min, take a vote or use the supplied class.
(vote-or-use-supplied-class classes vector-lists class-if-below-min)
; Here, we have enough vectors. Find out if they're all one class, and
; if so, return that class.
(if (= 1 (length (remove-if #'null vector-lists)))
; Here, there's just one class surviving. Get its index and pull out
; that class name. Wrap it in quotes.
(make-quoted-string (nth (position-if-not #'null vector-lists) classes))
; Here, we have more than enough vectors to proceed, and they're not
; all the same class. See if we want to truncate the tree due to depth,
; which we do if the user has supplied a non-NIL max-depth, and if we're
; deeper than that.
(if (and (not (null max-depth)) (> depth max-depth))
; Here, we've gone one level deeper than the cutoff. Use the same
; vote-or-fixed-class choice as above.
(vote-or-use-supplied-class classes vector-lists class-if-below-min)
; Here, we survived all the end-case checks -- say we're not done.
NIL))))
; Compute -(p log2 p) given number of vectors with one value, and total vectors.
; Define p log2 p to be 0 if p is 0.
(defun mplogp (num total)
(if (= 0 num)
0
(let ((fraction (/ num total)))
(- (* fraction (log fraction 2))))))
; Compute the entropy in a vector-lists. Recall it's split into classes.
; Sum -(p log2 p) over classes, where p = number of vectors in class / total
; number of vectors. !!! Find Quinlan's paper in which he discusses having more
; than two classes, to make sure there are no hidden gotchas. Since we've
; counted all the vectors, and we'll need that when we average over values,
; return that as well as the entropy.
(defun entropy (vector-lists)
(let* (; Get counts of vectors in each class.
(num-per-class (mapcar #'length vector-lists))
; Could call total-vectors to get this, but we've done half its work.
(total-vectors (reduce #'+ num-per-class))
; Form a list of -(p log2 p).
(mplogp-list (mapcar #'(lambda (num) (mplogp num total-vectors))
num-per-class)))
(list total-vectors (reduce #'+ mplogp-list))))
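; A worked example (only the lengths of the class lists matter, not the
; vector values): with 3 vectors in one class and 2 in another,
;
;   (entropy '((1 2 3) (4 5)))
;     => (5 0.9709506)   ; approximately
;
; since -(3/5)log2(3/5) - (2/5)log2(2/5) ~= 0.971 bits.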
; Compute the information change for the specified feature. If we were to use
; this to compute the gain, we'd subtract it from the entropy at the current
; node. Since we're not bothering with the entropy at the current node, we just
; negate this. We call split-on-feature here, which makes the split vector
; lists we'll need if we pick this feature. So we return them too.
(defun info-change (vector-lists index)
(let* (; Split vectors on this feature.
(split-vector-lists (split-on-feature vector-lists index))
; Compute the entropy for each value, along with the number of vectors
; having that value. Each vector-lists represents a separate value.
(entropy-per-value (mapcar #'entropy split-vector-lists))
; Total number of vectors.
(total (total-vectors vector-lists))
; To average over the values, we include a portion of entropy for each
; value, weighted by the fraction of those values among the total.
; Recall our list of entropies has pairs with the count of vectors
; having a value, then the entropy for that value.
(fraction-of-entropy
(mapcar #'(lambda (e) (* (/ (first e) total) (second e)))
entropy-per-value))
; Sum the fractions to form the average entropy for this feature.
; Negate it so we can test for the highest value of info-change.
(info-change-value (- (reduce #'+ fraction-of-entropy))))
; Package up the info-change-value with the split lists.
(list info-change-value split-vector-lists)))
; Helper for make-node: Get the info and split vector lists for each feature.
; We have a list of index numbers for the remaining unused features, which
; we can mapcar through to examine the remaining features. (The features and
; classes arguments are unused here at present.)
(defun get-info-and-split-vector-lists-list (features classes
feature-indices vector-lists)
; For each remaining feature, get the info and split vector lists. Return
; them all in a list.
(mapcar #'(lambda (index) (info-change vector-lists index))
feature-indices))
; Helper for make-node: Given a list of pairs of info and split vector lists
; for each feature, pick the best. Return its index and split vectors.
(defun pick-best (feature-indices info-and-split-vector-lists-list)
(let* (; Start with the first remaining feature as the "best so far".
(best-index (first feature-indices))
(best-info-and-split-vector-lists
(first info-and-split-vector-lists-list)))
; Step through other features, selecting the better of the current best
; and the next feature.
(mapc #'(lambda (next-index next-info-and-split-vector-lists)
; Compare current info-change with previous best.
; If better, replace it. (Info-change is the first item in
; info-and-split-vector-lists.)
(when (> (first next-info-and-split-vector-lists)
(first best-info-and-split-vector-lists))
(setf best-index next-index) ; Sometimes ya just gotta setf...
(setf best-info-and-split-vector-lists
next-info-and-split-vector-lists)))
; Tails of the supplied lists.
(rest feature-indices)
(rest info-and-split-vector-lists-list))
; Return the index of the best feature and its split vectors (remove the
; info value, because we don't need it once we've picked the best).
(list best-index (second best-info-and-split-vector-lists))))
; MAKE-NODE and MAKE-IF-STATEMENT *********************************************
;
; This pair constitutes the recursive operation that constructs the tree --
; they make a "node" and call themselves to make child nodes.
;
; Recall that we are constructing code, as a string. This has to work for any
; node, including the root. We have the feature list, class names, and the
; user's parameters, which are all static. What changes at each node are the
; lists of vectors in each class -- we have only the ones with the feature
; values along the path to the current node. In addition there are two
; bookkeeping values -- the depth (for checking against max-depth) and an
; indent string for pretty-printing.
;
; First, we check if we're at an end node. If we are, we return the class as
; a string -- our caller (us, since we're called recursively) will take care
; of whitespace if needed. Note that a bare value is fine as the body of a
; function -- if we're at the root, and there's just one class, then that's
; what we want to return when the run-time tree is used.
;
; If we're not at an end node, we extract the information we need to select
; the best feature, then make an "if" statement to test the value of that
; feature. In the "if" branches, we call ourself recursively.
;
; Arguments in common between make-node and make-if-statement:
;
; min-samples, class-if-below-min, max-depth -- see keyword args to dt-learn
; at top of this file
; depth = depth in decision tree of this node; root is 0
; indent-string = blanks to prepend to code, so it's pretty-printed
; feature-indices = indices of the remaining features -- those we have
; available to pick from at this node
;
; Other arguments to make-node:
;
; vector-lists = the entire vector-lists at this node, i.e.:
; (list vectors-in-class-1 vectors-in-class-2 ...)
;
; Other arguments to make-if-statement:
;
; feature = the selected feature to test at this node
; split-vector-lists = a list of vector-lists, one for each value of the given
; feature. (These are packaged in a list because in the
; general case, the feature might have an arbitrary number
; of values.)
; Form the "if" statement derived from the given feature, with each branch's
; contents generated by recursive calls to make node on each of a pair of
; vector-lists, one for each value of the feature, T or NIL.
(defun make-if-statement (features classes feature-indices
feature split-vector-lists
min-samples class-if-below-min max-depth
depth indent-string)
; Wrap an "if" statement around recursive calls to make-node. Note we
; increment the depth, decrement the number of features, and add some spaces
; to the indent. The "if" test uses contains-in-order to check if the
; supplied feature is in the sentence. The first if branch, for value T,
; uses the first split vector-lists, and the second branch gets the second
; vector-lists.
(concatenate 'string
; Start the "if" statement and its test, contains-in-order. First argument
; to contains-in-order is the sentence variable name, "sentence".
(princ-to-string #\newline)
indent-string
"(if (contains-in-order sentence "
; Second argument to contains-in-order is the feature. We have a list of
; words, where the words are strings. We need a text string of a list
; of those words, where the words are quoted.
(make-string-of-list feature)
; Close off the test.
")"
; Add the first (T) branch.
(princ-to-string #\newline)
(make-node features classes feature-indices (first split-vector-lists)
min-samples class-if-below-min max-depth
depth indent-string)
; Add the second (NIL) branch.
(princ-to-string #\newline)
(make-node features classes feature-indices (second split-vector-lists)
min-samples class-if-below-min max-depth
depth indent-string)
; Close off the if.
")"))
(defun make-node (features classes feature-indices vector-lists
min-samples class-if-below-min max-depth
depth indent-string)
(let (; Get the class if this should be an end node, or NIL if not.
(end-class (end-case classes vector-lists
min-samples class-if-below-min max-depth depth))
; We'll need the deeper indent for both if branches, so make it here.
(next-indent-string (concatenate 'string " " indent-string)))
(if end-class
; Here if we're at an end node -- just return the class string.
(concatenate 'string next-indent-string end-class)
; Here if we have more work to do... We want to pick the best feature
; to split on.
(let* (; For each feature, calculate the info-gain and get the
; split lists.
(info-and-split-vector-lists-list
(get-info-and-split-vector-lists-list
features classes feature-indices vector-lists))
; Select the feature with the highest info-gain.
(best-index-and-split-vector-lists
(pick-best feature-indices info-and-split-vector-lists-list))
(best-index (first best-index-and-split-vector-lists))
(split-vector-lists (second best-index-and-split-vector-lists))
; Extract the best feature out of the features list.
(feature (nth best-index features))
; Remove the chosen feature from the feature index list.
; (Warning! Do not attempt to setf this to save space. Each
; path gets its own variant depending on the features previously
; selected along that path.)
(reduced-feature-indices (remove best-index feature-indices)))
; Make the "if" statement for this feature. (Since we removed the index
; for the chosen feature from feature-indices, we may as well update
; the other values that will need to be adjusted for the new tree level
; (depth and indent-string) here, rather than leaving some for
; make-if-statement to do.)
(make-if-statement features classes reduced-feature-indices
feature split-vector-lists
min-samples class-if-below-min max-depth
(1+ depth)
next-indent-string)))))
; DT-LEARN ********************************************************************
;
; This is the top-level call. It does a sanity-check on the arguments; makes
; the initial data-lists (consisting of the feature list, class name list, and
; vector lists for each class) by calling read-dataset and make-data-lists;
; calls make-node to get the decision tree code string as a big "if" statement;
; adds a "defun" wrapper and a "compile" on the end, and writes the whole mess
; out.
(defun dt-learn (dataset-files excluded-words
&key (min-samples 5) (class-if-below-min 'vote)
(max-depth NIL) (output-file "dt-perform.lsp"))
(let* (
; Sanity checks: We can't allow class-if-below-min to be NIL, nor
; can we allow min-samples to be zero, nor max-depth to be anything
; but a positive integer or NIL, nor training-fraction (if we were
; using it) to be outside the open interval (0,1).
(safe-min-samples (if (= 0 min-samples) 1 min-samples))
(safe-class-if-below-min (if (null class-if-below-min)
'vote class-if-below-min))
(safe-max-depth (when (or (null max-depth)
(and (typep max-depth 'number)
(plusp max-depth)))
max-depth))
; Read in the data. For now, the user should split off the training
; data and pass just that to us.
(data-lists (make-data-lists (read-dataset dataset-files)
excluded-words))
; Extract features, classes, and vector lists from the data-lists.
(features (get-features data-lists))
(classes (get-classes data-lists))
(vector-lists (get-vector-lists data-lists))
; Find out how many features there are by looking at the size of
; the feature list. Then make a list of indices for all features.
; As we use a feature, we'll remove its index from the list.
(feature-indices (make-index-list (length features)))
; Generate a string of code. Concatenate pieces that are produced
; separately.
(code
(concatenate 'string
; Start off with a defun.
(princ-to-string #\newline)
"(defun dt-perform (sentence)"
(princ-to-string #\newline)
; Generate the big conditional representing the tree. It returns
; a string with the name of the class.
(make-node features classes feature-indices vector-lists
safe-min-samples safe-class-if-below-min
safe-max-depth 0 "")
; Close off the defun and add a compile.
")"
(princ-to-string #\newline)
(princ-to-string #\newline)
"(compile 'dt-perform)"
(princ-to-string #\newline))))
; Done with the code string (concatenate was closed just above).
; Write it out. Use "~A" not "~S" so that quotes don't get escaped, and
; :if-exists :supersede so a re-run overwrites the old file instead of
; signaling an error.
(with-open-file (output-stream output-file :direction :output
                                           :if-exists :supersede)
  (format output-stream "~A" code))))
; TESTING *********************************************************************
; Provide a function that runs dt-perform on a set of test examples and reports
; how many were right out of what total. We expect that dt-perform has
; already been loaded. For now, caller must separate test from training
; examples, and pass in only test examples.
(defun dt-test (dataset-files)
(let* (; Read in the test data. Pull out the class list and example lists.
; (Note we stick with sentences here -- no conversion to vectors.)
(data (read-dataset dataset-files))
(classes (first data))
(sentence-lists (rest data))
; For each class, run the sentences through dt-perform. Collect a
; list of classes.
(result-lists (mapcar #'(lambda (sentence-list)
(mapcar #'dt-perform sentence-list))
sentence-lists))
; Filter out the ones that don't match their true classes.
(correct-lists (mapcar #'(lambda (result-list class)
(remove-if-not #'(lambda (result)
(equal result class))
result-list))
result-lists
classes))
; Count the ones that match their true classes. At the end, sum them.
(correct-counts (mapcar #'length correct-lists))
(correct-total (reduce #'+ correct-counts))
; Total sentences in the test set.
(num-vectors (total-vectors sentence-lists)))
; Show results.
(format t "~%Number correct ~D out of ~D~%" correct-total num-vectors)))
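; Illustrative usage (file names hypothetical; the learned tree must be
; loaded first):
;
;   (load "dt-perform.lsp")
;   (dt-test (list "target1.test" "target2.test"))
;
; which prints something like "Number correct 42 out of 50".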