import numpy as np
import scipy as sp
from scipy import optimize
import math
import time
import sys
import scipy.stats
import json
from operator import itemgetter
# This file contains the code for running the experiments in the paper
#
# A. Kolobov, Y. Peres, C. Lu, E. Horvitz. "Staying up to Date with Online Content Changes Using Reinforcement Learning for Scheduling." NeurIPS-2019.
epsilon_learn_global = 0.0001
# Learns sources' Poisson change rates from observation histories.
#
# The curr_time parameter is supposed to be a time relative to the same timestamp as the timestamps in histories. The suffix_length parameter is the
# length of the history suffix immediately preceding curr_time, telling this method to use only the history interval [curr_time - suffix_length, curr_time]
# for learning the change rates.
def LearnChRates(observations_incompl_obs, ch_rates_incompl_obs_est, observations_compl_obs, \
ch_rates_compl_obs_est, epsilon_learn, suffix_length, curr_time):
for w in range(len(ch_rates_incompl_obs_est)):
ch_rates_incompl_obs_est[w] = LearnChRates_IncomplObs(observations_incompl_obs[w], suffix_length, curr_time, epsilon_learn)
for w in range(len(ch_rates_compl_obs_est)):
ch_rates_compl_obs_est[w] = LearnChRates_ComplObs(observations_compl_obs[w], suffix_length, curr_time)
# Learns a source's Poisson change rate from the source's observation history, assuming complete change observations.
#
# The training_data parameter is an array of timestamps at which changes were detected, sorted in increasing order. The timestamps are assumed to be
# relative to some time point in the past. The suffix_length parameter is the length of the history suffix immediately preceding curr_time, telling this
# method to use only the history interval [curr_time - suffix_length, curr_time] for learning the change rates. The method uses Equation 11 from the
# NeurIPS-2019 paper for learning the change rate.
def LearnChRates_ComplObs(training_data, suffix_length, curr_time):
# We use a type of smoothing whereby we assume we start with some number (not necessarily an integer) of imaginary change observations over an
# imaginary period of time. The values we used for the experiments in the NeurIPS-2019 paper are below.
smoothing_count_term = 0.5
smoothing_interval_term = 0.5
total_changes = 0
time_so_far = 0
i = len(training_data) - 1
while (i >= 0 and training_data[i] > curr_time):
i -= 1
if (i < 0):
# the observation history before curr_time is empty
return smoothing_count_term / (smoothing_interval_term + min(curr_time, suffix_length))
elif (curr_time - training_data[i] > suffix_length):
# there were no observations in the time interval [curr_time - suffix_length, curr_time]
return smoothing_count_term / (smoothing_interval_term + suffix_length)
else:
time_so_far = curr_time - training_data[i]
while (time_so_far < suffix_length and i >= 0):
# We assume we are fresh w.r.t. every source when we start crawling
previous_interval = (training_data[i] - training_data[i - 1]) if (i > 0) else training_data[i]
total_changes += 1
time_so_far = min(time_so_far + previous_interval, suffix_length)
i -= 1
return ((total_changes + smoothing_count_term) / (time_so_far + smoothing_interval_term))
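# Illustrative usage sketch (not part of the original experiment code; the timestamps below are made up).
# Four changes observed at times 1, 2, 3, 4 over a 4-unit window should yield an estimate of
# (4 + 0.5) / (4 + 0.5) = 1.0 changes per unit time with the smoothing terms above.
def Example_LearnChRatesComplObs():
    change_times = [1.0, 2.0, 3.0, 4.0]
    rate = LearnChRates_ComplObs(change_times, math.inf, 4.0)
    print("Estimated change rate (complete observations):", rate)  # expect 1.0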
# Learns a source's Poisson change rate from the source's observation history assuming incomplete change observations.
#
# The training_data parameter is a 2D array of history data for a given source, where training_data[i][0] is the timestamp of the i-th crawl,
# assumed to be relative to some time point in the past, and training_data[i][1] is a 0/1 flag indicating whether that crawl detected any changes. The
# suffix_length parameter is the length of history suffix immediately preceding curr_time telling this method to use only the history interval from
# [curr_time - suffix, curr_time] for learning the change rates. The method uses Equation 10 from the NeurIPS-2019 paper for learning the change rate.
def LearnChRates_IncomplObs(training_data, suffix_length, curr_time, epsilon):
# We use a type of smoothing whereby we assume we start with some number of inter-crawl intervals (not necessarily an integer) of various lengths
# that detected a change and some number that didn't. We used one interval of each kind, whose length in both cases is the same, shown below.
smoothing_term = 0.5
time_so_far = 0
# Initialize the sets of inter-observation intervals assuming we had one imaginary interval after which we observed
# a change and one imaginary interval after which we didn't, with the length of either interval equal to
# the smoothing term.
list_ch_training_intervals = [smoothing_term]
list_no_ch_training_intervals = [smoothing_term]
i = len(training_data) - 1
while (i >= 0 and training_data[i][0] > curr_time):
i -= 1
if (i < 0):
# The observation history before curr_time is empty. Return the change rate estimate
# assuming we had one imaginary interval after which we observed a change and
# one imaginary interval after which we didn't, with the length of either interval equal to
# the smoothing term.
return math.log(2) / smoothing_term
# Count back the history suffix starting with the time of the latest observation (crawl) that happened before curr_time. We can't use the time
# interval between the latest crawl and the current time for training purposes, because we haven't collected any training data (observations)
# in that interval, and hence, due to the incompleteness of our observation history, don't know whether any changes have happened since the
# latest observation
while (time_so_far < suffix_length and i >= 0):
previous_interval = (training_data[i][0] - training_data[i - 1][0]) if (i > 0) else training_data[i][0]
# Note that after the next line, it may be that time_so_far + previous_interval > suffix_length.
# This is fine -- to make learning less biased, we need to either use the last observation that falls into
# [reference_time - suffix_length, reference_time] *and* the *entire* intra-observation interval
# preceding it, or use neither that observation nor that interval. We choose to do the former.
if training_data[i][1] == 1:
list_ch_training_intervals.append(previous_interval)
else:
list_no_ch_training_intervals.append(previous_interval)
time_so_far += previous_interval
i -= 1
max_ch_interval = max(list_ch_training_intervals)
min_ch_interval = min(list_ch_training_intervals)
sum_ch_intervals = sum(list_ch_training_intervals)
sum_no_ch_intervals = sum(list_no_ch_training_intervals)
# Based on the MLE for incomplete observation history, it's easy to verify that the bounds always bracket
# the change rate estimate. However, they affect the exponent of e during learning, and when they are
# too loose, cause numerical errors. Therefore, we bound the range of change rate's values based on reasonable
# guesses based on prior knowledge.
ch_rate_lb = 0.0005 # math.log(sum_ch_intervals / sum_no_ch_intervals + 1) / max_ch_interval;
ch_rate_ub = 7 # math.log(sum_ch_intervals / sum_no_ch_intervals + 1) / min_ch_interval;
return BisectionSearchMonotoneDecr(DataLLDerivative, (list_ch_training_intervals, list_no_ch_training_intervals), ch_rate_lb, ch_rate_ub, epsilon)
# Computes the derivative, with respect to the change rate, of the data log-likelihood of a history of incomplete change observations under a given Poisson change rate.
def DataLLDerivative(params, ch_rate):
term_with_changes = 0
term_without_changes = 0
list_ch_training_intervals = params[0]
list_no_ch_training_intervals = params[1]
for interval in list_ch_training_intervals:
term_with_changes += (interval / (math.exp(ch_rate * interval) - 1))
for interval in list_no_ch_training_intervals:
term_without_changes += interval
return (term_with_changes - term_without_changes)
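# Illustrative usage sketch (not part of the original experiment code; the observation history below is made up).
# Six crawls one time unit apart detect changes on alternating crawls, so roughly half of the unit-length
# intervals contain a change; the MLE found by bisection should land in the vicinity of ln(2) ~= 0.69
# changes per unit time (the smoothing intervals pull it up somewhat).
def Example_LearnChRatesIncomplObs():
    observations = [(1.0, 1), (2.0, 0), (3.0, 1), (4.0, 0), (5.0, 1), (6.0, 0)]
    rate = LearnChRates_IncomplObs(observations, math.inf, 6.0, epsilon_learn_global)
    print("Estimated change rate (incomplete observations):", rate)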
# Evaluates the Lagrangian of the negative harmonic policy cost for a given lambda.
#
# See Proposition 2 and equation system 5 in the NeurIPS-2019 paper for details.
def PolicyValueLagr_NoObs(params, lambd):
sum = 0
importances = params[0]
ch_rates = params[1]
bandwidth = params[2]
# We are assuming importances and ch_rates have the same length
for i in range(len(importances)):
sum += (-ch_rates[i] + math.sqrt(ch_rates[i]**2 + 4 * importances[i] * ch_rates[i] / lambd)) / 2
return sum - bandwidth
# Bisection search for the root of a 1-variable function parameterized by known parameter values (param) and assumed to be monotonically decreasing
# on the interval [var_lb_init, var_ub_init].
#
# The assumption that the function is monotonically decreasing lets us save one function evaluation.
def BisectionSearchMonotoneDecr(func, params, var_lb_init, var_ub_init, epsilon):
var_lb = var_lb_init
var_ub = var_ub_init
var_center = (var_ub + var_lb) / 2
num_it = 0
while ((var_ub - var_lb) / 2 > epsilon):
num_it += 1
func_val = func(params, var_center)
if (func_val == 0):
return var_center
elif (func_val > 0):
var_lb = var_center
else:
var_ub = var_center
var_center = (var_ub + var_lb) / 2
return var_center
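# Illustrative usage sketch (not part of the original experiment code). The made-up function
# f(params, x) = params - x is monotonically decreasing in x with its root at x = params, so
# searching on [0, 10] with params = 3.0 should converge to 3.0 within the requested tolerance.
def Example_BisectionSearchMonotoneDecr():
    root = BisectionSearchMonotoneDecr(lambda params, x: params - x, 3.0, 0.0, 10.0, 1e-6)
    print("Root found by bisection:", root)
    assert abs(root - 3.0) < 1e-5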
# LambdaCrawl for sources with incomplete change observations.
#
# See the pseudocode in Algorithm 1 in the NeurIPS-2019 paper for details.
def LambdaCrawl_IncomplObs(importances, ch_rates, bandwidth, epsilon):
if (bandwidth == 0 or importances.shape[0] == 0):
return np.zeros_like(importances, dtype=float)
min_importance = min(importances)
max_importance = max(importances)
min_ch_rate = min(ch_rates)
max_ch_rate = max(ch_rates)
lambda_lb = len(importances)**2 * min_importance * min_ch_rate / (len(importances) * max_ch_rate * bandwidth + bandwidth**2)
lambda_ub = len(importances)**2 * max_importance * max_ch_rate / (len(importances) * min_ch_rate * bandwidth + bandwidth**2)
lambd = BisectionSearchMonotoneDecr(PolicyValueLagr_NoObs, (importances, ch_rates, bandwidth), lambda_lb, lambda_ub, epsilon)
crawl_rates = np.zeros_like(importances, dtype=float)
for i in range(len(crawl_rates)):
crawl_rates[i] = (-ch_rates[i] + math.sqrt(ch_rates[i]**2 + 4 * importances[i] * ch_rates[i] / lambd)) / 2
return crawl_rates
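# Illustrative usage sketch (not part of the original experiment code; the importances, change rates,
# and bandwidth are made up). The returned crawl rates should sum to the bandwidth, with a larger
# share going to the more important source.
def Example_LambdaCrawlIncomplObs():
    importances = np.array([3.0, 1.0])
    ch_rates = np.array([1.0, 1.0])
    bandwidth = 2.0
    crawl_rates = LambdaCrawl_IncomplObs(importances, ch_rates, bandwidth, 1e-6)
    print("Crawl rates:", crawl_rates)
    assert abs(sum(crawl_rates) - bandwidth) < 1e-3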
# LambdaCrawl approximation for sources with incomplete change observations that assumes importance_w/change_rate_w = c for a fixed (unknown) c
# for all sources w.
#
# See Proposition 9 in the NeurIPS-2019 paper for details.
def LambdaCrawlApprox_IncomplObs(importances, ch_rates, bandwidth, epsilon):
crawl_rates = np.zeros_like(importances, dtype=float)
sum_imp = sum(importances)
for w in range(len(importances)):
crawl_rates[w] = importances[w] * bandwidth / sum_imp
return crawl_rates
# LambdaCrawl for sources with complete change observations.
#
# See the pseudocode in Algorithm 2 in the NeurIPS-2019 paper for details.
def LambdaCrawl_ComplObs(importances, ch_rates, bandwidth):
if (bandwidth == 0):
return np.zeros_like(importances, dtype=float)
crawl_probs = np.zeros_like(importances, dtype=float)
crawl_probs_approx = np.empty_like(importances, dtype=float)
remaining_bandwidth = bandwidth
num_it = 0
while True:
num_it += 1
saturated_a_constraint = False
# For the calculation of the denominator later in this loop, we will need the sum of importances, *but only of those sources
# for which we haven't determined crawl_probs[w] = 1 in previous iterations*. Sources for which we have determined this have
# been excluded from consideration entirely.
sum_importances = 0
for w in range(len(importances)):
if crawl_probs[w] == 0:
sum_importances += importances[w]
for w in range(len(importances)):
# Under any acceptable solution the crawl probability of every source is > 0, so crawl_probs[w] == 0
# means that we haven't determined the final value for this source yet
if crawl_probs[w] == 0:
p_hat_w = bandwidth * importances[w] / (ch_rates[w] * sum_importances)
# if the probability constraint p_w <= 1 is saturated for source w...
if (p_hat_w >= 1.0):
crawl_probs[w] = 1.0
# indicate that we don't need to reconsider the crawl probability value of source w in subsequent iterations
crawl_probs_approx[w] = -1
remaining_bandwidth -= (crawl_probs[w] * ch_rates[w])
saturated_a_constraint = True
else:
crawl_probs_approx[w] = p_hat_w
# if we didn't saturate any constraints in the last iteration, the non-negative values in
# crawl_probs_approx are the final crawl probability values of the corresponding sources
if saturated_a_constraint == False:
for w in range(len(importances)):
if (crawl_probs_approx[w] != -1):
crawl_probs[w] = crawl_probs_approx[w]
break
# otherwise, repeat with remaining bandwidth and sources that don't have crawl_probs_approx[w] = -1 yet
else:
bandwidth = remaining_bandwidth
return crawl_probs
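# Illustrative usage sketch (not part of the original experiment code; the inputs are made up).
# The first source is important enough for its crawl probability to saturate at 1.0; the remaining
# bandwidth then goes to the second source, so with these numbers the result should be roughly
# [1.0, 0.5], and the expected crawls per unit time (sum of p_w * ch_rate_w) should equal the bandwidth.
def Example_LambdaCrawlComplObs():
    importances = np.array([10.0, 1.0])
    ch_rates = np.array([1.0, 1.0])
    crawl_probs = LambdaCrawl_ComplObs(importances, ch_rates, 1.5)
    print("Crawl probabilities:", crawl_probs)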
# Computes the harmonic policy cost for URLs with incomplete observations.
#
# See Proposition 1/Equation 4 in the NeurIPS-2019 paper.
def HarmonicPolicyCost_IncomplObs(importances, ch_rates, crawl_rates):
if (len(importances) == 0):
return 0
cost_incompl_obs = 0
for w in range(len(importances)):
if crawl_rates[w] == 0:
cost_incompl_obs = math.inf
break
cost_incompl_obs -= importances[w] * math.log(crawl_rates[w] / (crawl_rates[w] + ch_rates[w]))
return cost_incompl_obs / len(importances)
# Computes the harmonic policy cost for URLs with complete observations.
#
# See the formula in Proposition 4 in the NeurIPS-2019 paper.
def HarmonicPolicyCost_ComplObs(importances, ch_rates, crawl_probs):
if (len(importances) == 0):
return 0
cost_compl_obs = 0
for w in range(len(importances)):
if crawl_probs[w] == 0:
cost_compl_obs = math.inf
break
cost_compl_obs -= importances[w] * math.log(crawl_probs[w])
return cost_compl_obs / len(importances)
# Computes the harmonic policy cost for a mix of URLs with complete and incomplete observations.
#
# See the Equation 8 in the NeurIPS-2019 paper.
def HarmonicPolicyCost(crawl_rates_incompl_obs, crawl_probs_compl_obs, importances_incompl_obs, ch_rates_incompl_obs, \
importances_compl_obs, ch_rates_compl_obs):
cost_incompl_obs = HarmonicPolicyCost_IncomplObs(importances_incompl_obs, ch_rates_incompl_obs, crawl_rates_incompl_obs)
cost_compl_obs = HarmonicPolicyCost_ComplObs(importances_compl_obs, ch_rates_compl_obs, crawl_probs_compl_obs)
return ((len(importances_incompl_obs) * cost_incompl_obs + len(importances_compl_obs) * cost_compl_obs) \
/ (len(importances_incompl_obs) + len(importances_compl_obs)))
# Computes the binary policy cost for URLs with incomplete observations.
#
# See Equation 12 in the NeurIPS-2019 paper's supplement.
def BinaryPolicyCost_IncomplObs(importances, ch_rates, crawl_rates):
if (len(importances) == 0):
return 0
cost_incompl_obs = 0
for w in range(len(importances)):
cost_incompl_obs += importances[w] * ch_rates[w] / (crawl_rates[w] + ch_rates[w])
return cost_incompl_obs / len(importances)
# Computes the binary policy cost for URLs with complete observations.
#
# See Equation 13 in the NeurIPS-2019 paper's supplement.
def BinaryPolicyCost_ComplObs(importances, ch_rates, crawl_probs):
if (len(importances) == 0):
return 0
cost_compl_obs = 0
for w in range(len(importances)):
cost_compl_obs += importances[w] * (1 - crawl_probs[w])
return cost_compl_obs / len(importances)
# Computes the binary policy cost for a mix of URLs with complete and incomplete observations.
def BinaryPolicyCost(crawl_rates_incompl_obs, crawl_probs_compl_obs, importances_incompl_obs, ch_rates_incompl_obs, \
importances_compl_obs, ch_rates_compl_obs):
cost_incompl_obs = BinaryPolicyCost_IncomplObs(importances_incompl_obs, ch_rates_incompl_obs, crawl_rates_incompl_obs)
cost_compl_obs = BinaryPolicyCost_ComplObs(importances_compl_obs, ch_rates_compl_obs, crawl_probs_compl_obs)
return ((len(importances_incompl_obs) * cost_incompl_obs + len(importances_compl_obs) * cost_compl_obs) \
/ (len(importances_incompl_obs) + len(importances_compl_obs)))
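# Illustrative usage sketch (not part of the original experiment code; the toy policy below is made up).
# Each source is crawled exactly as often as it changes, so crawl_rate / (crawl_rate + ch_rate) = 0.5
# for both sources; the harmonic cost should come out to about 1.04 and the binary cost to 0.75.
def Example_PolicyCosts():
    importances = np.array([2.0, 1.0])
    ch_rates = np.array([1.0, 0.5])
    crawl_rates = np.array([1.0, 0.5])
    harmonic_cost = HarmonicPolicyCost(crawl_rates, [], importances, ch_rates, [], [])
    binary_cost = BinaryPolicyCost(crawl_rates, [], importances, ch_rates, [], [])
    print("Harmonic cost:", harmonic_cost, "Binary cost:", binary_cost)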
# Computes the harmonic policy cost for a given bandwidth split across complete- and incomplete-change-history URLs.
#
# See Algorithm 3 in the NeurIPS-2019 paper. The return of the function in the pseudocode is the negative of SplitEval_JStar.
def SplitEval_JStar(bandwidth_compl_obs, solver_x_incompl_obs, importances_incompl_obs, ch_rates_incompl_obs, epsilon_incompl_obs, importances_compl_obs, ch_rates_compl_obs, bandwidth):
if (bandwidth_compl_obs > bandwidth):
raise ValueError('SplitEval_JStar ERROR: bandwidth allocation to sources with complete observations exceeds total bandwidth! Bandwidth allocation to sources with complete observations: ', bandwidth_compl_obs, ", total bandwidth: ", bandwidth)
crawl_rates_incompl_obs = solver_x_incompl_obs(importances_incompl_obs, ch_rates_incompl_obs, bandwidth - bandwidth_compl_obs, epsilon_incompl_obs)
J_incompl_obs = 0
for w in range(len(importances_incompl_obs)):
if crawl_rates_incompl_obs[w] == 0:
J_incompl_obs = math.inf
break
else:
J_incompl_obs -= (importances_incompl_obs[w] * math.log(crawl_rates_incompl_obs[w] / (crawl_rates_incompl_obs[w] + ch_rates_incompl_obs[w])))
crawl_probs_compl_obs = LambdaCrawl_ComplObs(importances_compl_obs, ch_rates_compl_obs, bandwidth_compl_obs)
J_compl_obs = 0
for w in range(len(importances_compl_obs)):
if crawl_probs_compl_obs[w] == 0:
J_compl_obs = math.inf
break
else:
J_compl_obs -= (importances_compl_obs[w] * math.log(crawl_probs_compl_obs[w]))
return J_incompl_obs + J_compl_obs
# Implements the LambdaCrawl family of algorithms.
#
# See Algorithm 3 in the NeurIPS-2019 paper. The implementation can use either the optimal LambdaCrawl_IncomplObs or the approximate
# LambdaCrawlApprox_IncomplObs for handling the incomplete-change-observation sources.
def LambdaCrawl_X(solver_x_incompl_obs, importances_incompl_obs, ch_rates_incompl_obs, epsilon_incompl_obs, importances_compl_obs, ch_rates_compl_obs, bandwidth):
# We use a minimization routine here, so SplitEval_JStar returns the value of J* for a given split, _not_ of \overline{J}^* = -J^*
# as in LambdaCrawl's description in the paper.
result = sp.optimize.minimize_scalar(SplitEval_JStar, bounds=(0, min(bandwidth, sum(ch_rates_compl_obs))), \
args=(solver_x_incompl_obs, importances_incompl_obs, ch_rates_incompl_obs, epsilon_incompl_obs, importances_compl_obs, ch_rates_compl_obs, bandwidth), \
method='bounded', options={'xatol': 0.005 * bandwidth})
if result.success:
bandwidth_compl_obs = result.x
crawl_rates_incompl_obs = solver_x_incompl_obs(importances_incompl_obs, ch_rates_incompl_obs, bandwidth - bandwidth_compl_obs, epsilon_incompl_obs)
crawl_probs_compl_obs = LambdaCrawl_ComplObs(importances_compl_obs, ch_rates_compl_obs, bandwidth_compl_obs)
return (crawl_rates_incompl_obs, crawl_probs_compl_obs)
else:
raise ValueError('LambdaCrawl ERROR: bounded minimization failed')
# Implements LambdaCrawl proper. See LambdaCrawl_X for details.
def LambdaCrawl(importances_incompl_obs, ch_rates_incompl_obs, epsilon_incompl_obs, importances_compl_obs, ch_rates_compl_obs, bandwidth):
return LambdaCrawl_X(LambdaCrawl_IncomplObs, importances_incompl_obs, ch_rates_incompl_obs, epsilon_incompl_obs, importances_compl_obs, ch_rates_compl_obs, bandwidth)
# Implements LambdaCrawlApprox, i.e., LambdaCrawl that uses the approximation from Proposition 9 in the NeurIPS-2019 paper to handle the
# incomplete-change-observation sources. See LambdaCrawl_X for details.
def LambdaCrawlApprox(importances_incompl_obs, ch_rates_incompl_obs, epsilon_incompl_obs, importances_compl_obs, ch_rates_compl_obs, bandwidth):
return LambdaCrawl_X(LambdaCrawlApprox_IncomplObs, importances_incompl_obs, ch_rates_incompl_obs, epsilon_incompl_obs, importances_compl_obs, ch_rates_compl_obs, bandwidth)
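# Illustrative usage sketch (not part of the original experiment code; the toy instance below is made up).
# LambdaCrawl splits the bandwidth between the two incomplete-observation sources and the single
# complete-observation source; the total expected number of crawls per unit time (sum of the crawl rates
# plus sum of p_w * ch_rate_w) should be close to the bandwidth.
def Example_LambdaCrawlMixed():
    importances_incompl_obs = np.array([2.0, 1.0])
    ch_rates_incompl_obs = np.array([1.0, 0.5])
    importances_compl_obs = np.array([3.0])
    ch_rates_compl_obs = np.array([2.0])
    bandwidth = 2.0
    crawl_rates, crawl_probs = LambdaCrawl(importances_incompl_obs, ch_rates_incompl_obs, 1e-4, \
        importances_compl_obs, ch_rates_compl_obs, bandwidth)
    bandwidth_used = sum(crawl_rates) + sum(crawl_probs * ch_rates_compl_obs)
    print("Crawl rates:", crawl_rates, "Crawl probs:", crawl_probs, "Bandwidth used:", bandwidth_used)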
# Implements the BinaryLambdaCrawl family of algorithms.
#
# This is a generalization of the algorithm from Y. Azar, E. Horvitz, E. Lubetzky, Y. Peres, D. Shahaf. "Tractable near-optimal policies for crawling."
# PNAS-2018, which the NeurIPS-2019 paper refers to as BinaryLambdaCrawl. That algorithm optimizes the binary policy cost (see that paper for
# details) and as a result may fail to allocate any crawl rate to some of the sources. BinaryLambdaCrawl(epsilon) (see Subsection 9.3 in the
# supplement of the NeurIPS-2019 paper) is a modification of BinaryLambdaCrawl that forces it to allocate some bandwidth even to pages that
# BinaryLambdaCrawl would otherwise crawl-starve. In this family, BinaryLambdaCrawl(0.0) corresponds to the original BinaryLambdaCrawl.
# BinaryLambdaCrawl(0.4) has the best performance on the NeurIPS-2019 paper's dataset w.r.t. the harmonic policy cost, of all
# BinaryLambdaCrawl(epsilon) with epsilon in {0.0, 0.1,...,1}.
#
# The wrapper handles sources with complete and incomplete change observations. BinaryLambdaCrawl doesn't know how to handle the former in any special
# way; it simply treats them as if their observation history was incomplete. See LambdaCrawlBinary_Epsilon_Helper for most of this algorithm's logic.
def LambdaCrawlBinary_Epsilon(importances_incompl_obs, ch_rates_incompl_obs, epsilon, importances_compl_obs, ch_rates_compl_obs, bandwidth):
if (len(importances_compl_obs) > 0):
sys.exit("ERROR: LambdaCrawlBinary_Epsilon doesn't know how to handle complete observation histories, but importances_compl_obs is nonempty")
imps_and_chrates = np.column_stack((importances_incompl_obs, ch_rates_incompl_obs))
crawl_rates = np.zeros_like(imps_and_chrates[:,0], dtype=float)
min_crawl_rate = bandwidth / imps_and_chrates.shape[0] * epsilon
LambdaCrawlBinary_Epsilon_Helper(imps_and_chrates, crawl_rates, bandwidth, min_crawl_rate)
return crawl_rates, []
# The main part of BinaryLambdaCrawl implementation.
#
# See Section 9.3 of the NeurIPS-2019 paper's supplement and Y. Azar, E. Horvitz, E. Lubetzky, Y. Peres, D. Shahaf. "Tractable near-optimal policies
# for crawling." PNAS-2018 for details.
def LambdaCrawlBinary_Epsilon_Helper(imps_and_chrates, crawl_rates, bandwidth, min_crawl_rate):
idxs_and_value_ratios = []
for w in range(imps_and_chrates.shape[0]):
idxs_and_value_ratios.append((imps_and_chrates[w, 0] * imps_and_chrates[w, 1] / (imps_and_chrates[w, 1] + min_crawl_rate)**2, w))
r = 0
for w in range(len(idxs_and_value_ratios)):
r += math.sqrt(imps_and_chrates[w, 0] * imps_and_chrates[w, 1])
s = sum(imps_and_chrates[:, 1])
idxs_and_value_ratios.sort(key=itemgetter(0))
rem_bandwidth = bandwidth
for w in range(len(idxs_and_value_ratios)):
if (imps_and_chrates[idxs_and_value_ratios[w][1], 0] * imps_and_chrates[idxs_and_value_ratios[w][1], 1] \
/ (imps_and_chrates[idxs_and_value_ratios[w][1], 1] + min_crawl_rate)**2 <= (r / (rem_bandwidth + s ))**2):
r -= math.sqrt(imps_and_chrates[idxs_and_value_ratios[w][1], 0] \
* imps_and_chrates[idxs_and_value_ratios[w][1], 1])
s -= imps_and_chrates[idxs_and_value_ratios[w][1], 1]
crawl_rates[idxs_and_value_ratios[w][1]] = min_crawl_rate
rem_bandwidth -= crawl_rates[idxs_and_value_ratios[w][1]]
else:
# NOTE: this clause kicks in at every iteration after some iteration M. It doesn't alternate with the clause above.
crawl_rates[idxs_and_value_ratios[w][1]] = \
math.sqrt(imps_and_chrates[idxs_and_value_ratios[w][1], 0] \
* imps_and_chrates[idxs_and_value_ratios[w][1], 1]) \
* (rem_bandwidth + s) / r - imps_and_chrates[idxs_and_value_ratios[w][1], 1]
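# Illustrative usage sketch (not part of the original experiment code; the toy instance below is made up).
# With epsilon = 0.4 every source is guaranteed a floor rate of bandwidth / N * epsilon = 0.2 here; the two
# low-importance sources should end up at that floor and the high-importance source should receive the
# remaining bandwidth, with the rates summing to the bandwidth.
def Example_LambdaCrawlBinaryEpsilon():
    importances = np.array([5.0, 1.0, 0.2])
    ch_rates = np.array([1.0, 1.0, 1.0])
    bandwidth = 1.5
    crawl_rates, _ = LambdaCrawlBinary_Epsilon(importances, ch_rates, 0.4, [], [], bandwidth)
    print("Crawl rates:", crawl_rates, "Total:", sum(crawl_rates))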
# Implements the RL version of the LambdaCrawl family of algorithms.
#
# See Algorithm 4 in the NeurIPS-2019 paper. The implementation can use either LambdaCrawl itself, or LambdaCrawlApprox, its approximate version,
# or LambdaCrawlBinary that minimizes binary policy cost as solver_x, the algorithm for handling incomplete-change-observation sources.
def LambdaLearnAndCrawl_X(solver_x, importances_incompl_obs, ch_rates_incompl_obs_actual, epsilon_incompl_obs, \
importances_compl_obs, ch_rates_compl_obs_actual, bandwidth, epsilon_learn, epoch_length, suffix_len, horizon):
start_total = time.time()
curr_time = 0
# Object-dtype arrays of per-source history lists (np.object is deprecated in newer NumPy versions, so we use the builtin object).
changes_incompl_obs = np.empty(len(importances_incompl_obs), dtype=object)
for i in range(len(changes_incompl_obs)): changes_incompl_obs[i] = []
crawls_incompl_obs = np.empty(len(importances_incompl_obs), dtype=object)
for i in range(len(crawls_incompl_obs)): crawls_incompl_obs[i] = []
observations_incompl_obs = np.empty(len(importances_incompl_obs), dtype=object)
for i in range(len(observations_incompl_obs)): observations_incompl_obs[i] = []
changes_compl_obs = np.empty(len(importances_compl_obs), dtype=object)
for i in range(len(changes_compl_obs)): changes_compl_obs[i] = []
crawls_compl_obs = np.empty(len(importances_compl_obs), dtype=object)
for i in range(len(crawls_compl_obs)): crawls_compl_obs[i] = []
observations_compl_obs = changes_compl_obs # since the observation sequence is complete, we observe every change
ch_rates_incompl_obs_est = [1.0] * len(ch_rates_incompl_obs_actual)
ch_rates_compl_obs_est = [1.0] * len(ch_rates_compl_obs_actual)
idealized_policy_costs_harmonic = []
idealized_policy_costs_binary = []
while curr_time < horizon:
start_sch = time.time()
# Compute the optimal policy parameters given the current change rate estimates.
crawl_rates_incompl_obs, crawl_probs_compl_obs = solver_x(importances_incompl_obs, ch_rates_incompl_obs_est, \
epsilon_incompl_obs, importances_compl_obs, ch_rates_compl_obs_est, bandwidth)
# Apply Propositions 1 & 4 from the NeurIPS-2019 paper to find the (asymptotic) harmonic policy cost.
idealized_policy_cost_harmonic = HarmonicPolicyCost(crawl_rates_incompl_obs, crawl_probs_compl_obs, importances_incompl_obs,\
ch_rates_incompl_obs_actual, importances_compl_obs, ch_rates_compl_obs_actual)
# Apply Equations 12 & 13 from the NeurIPS-2019 paper's supplement to find the (asymptotic) binary policy cost.
idealized_policy_cost_binary = BinaryPolicyCost(crawl_rates_incompl_obs, crawl_probs_compl_obs, importances_incompl_obs,\
ch_rates_incompl_obs_actual, importances_compl_obs, ch_rates_compl_obs_actual)
print("Idealized harmonic policy cost: ", idealized_policy_cost_harmonic)
idealized_policy_costs_harmonic.append(idealized_policy_cost_harmonic)
idealized_policy_costs_binary.append(idealized_policy_cost_binary)
end_sch = time.time()
start_ext = time.time()
# Advance the simulation by one epoch: simulate source changes for sources of both types over the next epoch, then, given these
# changes, simulate crawls from the current policy using the policy parameters computed above, and finally generate observations over
# the same epoch, given the generated changes and crawls. Note that for extending the observation history of complete-observation
# sources, the scheduling policy doesn't matter.
ExtendChangeHistory(changes_incompl_obs, ch_rates_incompl_obs_actual, curr_time, epoch_length)
ExtendChangeHistory(changes_compl_obs, ch_rates_compl_obs_actual, curr_time, epoch_length)
ExtendCrawlHistory_IncomplObs(crawls_incompl_obs, crawl_rates_incompl_obs, curr_time, epoch_length)
ExtendCrawlHistory_ComplObs(changes_compl_obs, crawls_compl_obs, crawl_probs_compl_obs, curr_time, epoch_length)
ExtendObsHistory_IncomplObs(changes_incompl_obs, crawls_incompl_obs, observations_incompl_obs, curr_time, epoch_length)
end_ext = time.time()
curr_time = min(curr_time + epoch_length, horizon)
start_learn = time.time()
# Re-learn the change rates from the extended observation data. There is actually no need to learn for the incomplete-observation sources
# in case we use LambdaLearnAndCrawlApprox, but we learn them anyway.
LearnChRates(observations_incompl_obs, ch_rates_incompl_obs_est, observations_compl_obs, ch_rates_compl_obs_est, \
epsilon_learn, math.inf, curr_time)
end_learn = time.time()
#print('\tLearning took {} seconds'.format(end_learn - start_learn))
end_total = time.time()
total_time = end_total - start_total
print('RL took {} seconds in total'.format(total_time))
policy_cost = EvalMixedTrace(importances_incompl_obs, changes_incompl_obs, crawls_incompl_obs, importances_compl_obs, changes_compl_obs, \
crawls_compl_obs, horizon)
return policy_cost, total_time, idealized_policy_costs_harmonic, idealized_policy_costs_binary
# Implements LambdaLearnAndCrawl. See LambdaLearnAndCrawl_X for details.
def LambdaLearnAndCrawl(importances_incompl_obs, ch_rates_incompl_obs_actual , epsilon_incompl_obs, \
importances_compl_obs, ch_rates_compl_obs_actual, bandwidth, epsilon_learn, epoch_length, suffix_len, horizon):
print("Running LambdaLearnAndCrawl...")
return LambdaLearnAndCrawl_X(LambdaCrawl, importances_incompl_obs, ch_rates_incompl_obs_actual , epsilon_incompl_obs, \
importances_compl_obs, ch_rates_compl_obs_actual, bandwidth, epsilon_learn, epoch_length, suffix_len, horizon)
# Implements LambdaLearnAndCrawlApprox. See LambdaLearnAndCrawl_X for details.
def LambdaLearnAndCrawlApprox(importances_incompl_obs, ch_rates_incompl_obs_actual , epsilon_incompl_obs, \
importances_compl_obs, ch_rates_compl_obs_actual, bandwidth, epsilon_learn, epoch_length, suffix_len, horizon):
print("Running LambdaLearnAndCrawlApprox...")
return LambdaLearnAndCrawl_X(LambdaCrawlApprox, importances_incompl_obs, ch_rates_incompl_obs_actual , epsilon_incompl_obs, \
importances_compl_obs, ch_rates_compl_obs_actual, bandwidth, epsilon_learn, epoch_length, suffix_len, horizon)
# Implements LambdaLearnAndCrawlBinary. See LambdaLearnAndCrawl_X for details.
def LambdaLearnAndCrawlBinary(importances_incompl_obs, ch_rates_incompl_obs_actual , epsilon_incompl_obs, \
importances_compl_obs, ch_rates_compl_obs_actual, bandwidth, epsilon_learn, epoch_length, suffix_len, horizon):
print("Running LambdaLearnAndCrawlBinary...")
# LambdaCrawlBinary doesn't know how to deal with pages with complete change observations, so put all pages into the "incomplete change
# observations" bucket.
importances_incompl_obs_all = np.concatenate((importances_incompl_obs, importances_compl_obs))
ch_rates_incompl_obs_actual_all = np.concatenate((ch_rates_incompl_obs_actual, ch_rates_compl_obs_actual))
return LambdaLearnAndCrawl_X(LambdaCrawlBinary_Epsilon, importances_incompl_obs_all, ch_rates_incompl_obs_actual_all, epsilon_incompl_obs, \
[], [], bandwidth, epsilon_learn, epoch_length, suffix_len, horizon)
# Computes the n-th Harmonic number.
def Harmonic(n):
# Not very efficient for large n, but we don't expect n to be large under LambdaCrawl
return sum(1/k for k in range(1, n + 1))
# Computes the time-averaged harmonic penalty for a LambdaCrawl policy given a history of changes for a mixed set of sources with complete and
# incomplete change observations.
#
# Note that we need the history of changes themselves, not of observations, to do this evaluation. See EvalTrace for more details.
def EvalMixedTrace(importances_incompl_obs, changes_incompl_obs, crawls_incompl_obs, importances_compl_obs, changes_compl_obs, \
crawls_compl_obs, horizon):
return (len(importances_incompl_obs) * EvalTrace(importances_incompl_obs, changes_incompl_obs, crawls_incompl_obs, horizon) + \
len(importances_compl_obs) * EvalTrace(importances_compl_obs, changes_compl_obs, crawls_compl_obs, horizon)) / (len(importances_incompl_obs) + len(importances_compl_obs))
# Computes the time-averaged harmonic penalty for a LambdaCrawl policy.
def EvalTrace(importances, ch_hists, crawl_hists, horizon):
if (len(importances) == 0):
return 0
# This function returns the harmonic *cost* of a trace. I.e., the lower the value it returns, the better
J_pi = 0
for w in range(len(importances)):
# If the source never changed during the observation period, its contribution to the penalty is 0
if (not ch_hists[w]):
continue
# Otherwise, if there were changes but no crawls, count the number of changes before the horizon
if (not crawl_hists[w]):
num_changes = 0
for t in range(len(ch_hists[w])):
if (ch_hists[w][t] <= horizon):
num_changes += 1
else:
break
J_pi += (importances[w] * Harmonic(num_changes) / horizon)
continue
curr_num_changes = 0
running_penalty = 0
curr_change_time_idx = 0
curr_crawl_time_idx = 0
while (curr_change_time_idx < len(ch_hists[w]) and ch_hists[w][curr_change_time_idx] <= horizon):
# To continue the loop below we need to have either (a) unprocessed crawls s.t. their crawl times are after the current
# change time (but before the horizon), or (b) no such crawls, but the time horizon hasn't been reached yet.
while (((curr_crawl_time_idx >= len(crawl_hists[w]) or crawl_hists[w][curr_crawl_time_idx] > horizon) and ch_hists[w][curr_change_time_idx] <= horizon) or \
(curr_crawl_time_idx < len(crawl_hists[w]) and crawl_hists[w][curr_crawl_time_idx] <= horizon and ch_hists[w][curr_change_time_idx] <= crawl_hists[w][curr_crawl_time_idx])):
# If the time of a crawl coincides with the time of a change exactly (this can happen in the case of sources with complete
# change observations), don't count this change -- we assume it is picked up immediately. Just advance to the next change
# time.
if (curr_crawl_time_idx >= len(crawl_hists[w]) or \
not(ch_hists[w][curr_change_time_idx] == crawl_hists[w][curr_crawl_time_idx])):
curr_num_changes += 1
if (curr_crawl_time_idx < len(crawl_hists[w]) and \
ch_hists[w][curr_change_time_idx] == crawl_hists[w][curr_crawl_time_idx]):
curr_change_time_idx += 1
break
curr_change_time_idx += 1
if (curr_change_time_idx >= len(ch_hists[w])):
break
# tally the changes we missed
running_penalty += (0 if curr_num_changes == 0 else Harmonic(curr_num_changes))
curr_num_changes = 0
curr_crawl_time_idx += 1
J_pi += (importances[w] * running_penalty / horizon)
return J_pi / len(importances)
# Extends the change history for a set of sources from curr_time up to horizon.
#
# To extend the history for a given source, samples change times from the source's Poisson change process with a given rate parameter.
def ExtendChangeHistory(ch_hists, ch_rates, curr_time, horizon):
for w in range(len(ch_hists)):
if (len(ch_hists[w]) == 0):
ch_hists[w].append(np.random.exponential(1.0 / ch_rates[w]))
while (ch_hists[w][-1] < curr_time + horizon):
ch_hists[w].append(ch_hists[w][-1] + np.random.exponential(1.0 / ch_rates[w]))
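# Illustrative usage sketch (not part of the original experiment code; the rates are made up).
# Simulates change histories for two sources with Poisson change rates 2.0 and 0.5 over the window
# [0, 10]; the number of changes falling inside the window should be roughly rate * 10 for each source.
def Example_ExtendChangeHistory():
    ch_hists = [[], []]
    ch_rates = [2.0, 0.5]
    ExtendChangeHistory(ch_hists, ch_rates, 0.0, 10.0)
    for w in range(len(ch_hists)):
        num_changes = sum(1 for t in ch_hists[w] if t <= 10.0)
        print("Source", w, "changed", num_changes, "times before t = 10")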
# Extends the observation history for sources with incomplete change observations.
#
# This method assumes that both the change and crawl histories for sources with incomplete observations have *already* been extended
# until (or just past) curr_time + horizon. This means that for each such source both its change history and its crawl history are
# assumed to contain at least one element.
#
# WARNING: DO NOT CALL THIS METHOD TWICE WITH THE SAME (curr_time, horizon) PAIR. Calling this method twice with the same (curr_time, horizon) may
# re-append existing observations and thereby invalidate the observation history.
def ExtendObsHistory_IncomplObs(ch_hists, crawl_hists, observations_incompl_obs, curr_time, horizon):
for w in range(len(observations_incompl_obs)):
# First, find the last crawl in the time interval of interest, [curr_time, curr_time + horizon]. It's possible that the latest
# scheduled crawl overall is after curr_time+horizon and therefore hasn't happened yet -- ignore it
i = -1
while (abs(i) <= len(crawl_hists[w]) and crawl_hists[w][i] > curr_time + horizon):
i -= 1
if (abs(i) > len(crawl_hists[w]) or crawl_hists[w][i] <= curr_time):
# No crawls happened during time interval [curr_time, curr_time + horizon], so no new observations
continue;
i_last_crawl_in_interval = i
# Now, find the first crawl in the interval [curr_time, curr_time + horizon]
while (abs(i) <= len(crawl_hists[w]) and crawl_hists[w][i] > curr_time):
i -= 1
# Go back to i before the last subtraction. That's the index of the first crawl timestamp in the interval
# [curr_time, curr_time + horizon]
i_first_crawl_in_interval = i + 1
# For each crawl between and including these two, we need to determine whether we would observe any changes since the previous
# crawl. Note that i_crawl is always non-positive
for i_crawl in range(i_first_crawl_in_interval, i_last_crawl_in_interval + 1):
previous_crawl_time = crawl_hists[w][i_crawl - 1] if (abs(i_crawl - 1) <= len(crawl_hists[w])) else 0
# Find the index of the timestamp of the latest change before the latest crawl
j = -1
while (abs(j) <= len(ch_hists[w]) and ch_hists[w][j] > crawl_hists[w][i_crawl]):
j -= 1
if (abs(j) > len(ch_hists[w])):
observations_incompl_obs[w].append((crawl_hists[w][i_crawl], 0))
else:
# If this timestamp is after the previous crawl time, we know that there has been at least one change since the previous
# crawl (although there may have been more -- we wouldn't be able to tell the difference based on this observation!)
# and record this in our observation history. Conversely, we also know that if this timestamp is before the previous crawl
# time, there could not have been any change since the previous crawl, so we would observe no changes
if (ch_hists[w][j] > previous_crawl_time):
observations_incompl_obs[w].append((crawl_hists[w][i_crawl], 1))
else:
observations_incompl_obs[w].append((crawl_hists[w][i_crawl], 0))
# Extends the crawl history for sources with complete change observations.
#
# This method assumes that the change history has *already* been extended up to curr_time + horizon
# For every change in the interval [curr_time, curr_time + horizon] it then decides whether to crawl
# at that change's time or not.
#
# WARNING! DO NOT CALL THIS METHOD TWICE WITH THE SAME (curr_time, horizon) PAIR. Calling this method twice on the same change history for the same
# curr_time, horizon will resample a crawl for every change and append the sampled timestamps onto the crawl history, thereby invalidating it.
def ExtendCrawlHistory_ComplObs(ch_hists, crawl_hists, crawl_probs_compl_obs, curr_time, horizon):
for w in range(len(crawl_hists)):
# Find the index of the first change timestamp in [curr_time, curr_time + horizon]
i = 0
while ((abs(i - 1) <= len(ch_hists[w])) and ch_hists[w][i - 1] > curr_time):
i -= 1
while (i < 0 and ch_hists[w][i] <= curr_time + horizon):
if (np.random.binomial(1, crawl_probs_compl_obs[w]) == 1):
crawl_hists[w].append(ch_hists[w][i])
i += 1
# Extends the crawl history for sources with incomplete change observations.
#
# This method assumes that the change history has *already* been extended up to curr_time + horizon
#
# WARNING! DO NOT CALL THIS METHOD TWICE WITH THE SAME (curr_time, horizon) PAIR. Calling this method twice on the same change history for the same
# curr_time, horizon will resample crawls and append the sampled timestamps onto the crawl history, thereby invalidating it.
def ExtendCrawlHistory_IncomplObs(crawl_hists, crawl_rates_incompl_obs, curr_time, horizon):
for w in range(len(crawl_hists)):
if (not crawl_hists[w]):
crawl_hists[w].append(np.random.exponential(1.0 / crawl_rates_incompl_obs[w]))
else:
# Remove all crawl times, sampled using a previous crawl rate, that are scheduled after the first timestamp
# sampled using the current crawl rate
first_sample = curr_time + np.random.exponential(1.0 / crawl_rates_incompl_obs[w])
while (crawl_hists[w] and crawl_hists[w][-1] > first_sample):
del crawl_hists[w][-1]
crawl_hists[w].append(first_sample)
while (curr_time + horizon > crawl_hists[w][-1]):
crawl_hists[w].append(crawl_hists[w][-1] + np.random.exponential(1.0 / crawl_rates_incompl_obs[w]))
# Pre-processes the dataset from the NeurIPS-2019 paper (https://github.com/microsoft/Optimal-Freshness-Crawl-Scheduling; see the README for the
# dataset format) for consumption by the scheduling algorithms. Produces two files, one for URLs with complete and one for URLs with incomplete
# change observations. Each file has the tab-separated format
#
# Imp_1 ChangeRate_1
# ... ...
# Imp_N ChangeRate_N
#
# where Imp_i and ChangeRate_i are the importance score and change rate of some URL. For complete-change-observation URLs both of these pieces
# of data are already in the dataset itself. For incomplete-change-observation URLs the change rates need to be learned from the dataset's crawl
# and observation history.
def ProcessRawData(imps_data_file, changes_data_file, change_rates_compl_obs_data_file, \
out_incompl_obs_file="imps_and_chrates_incompl.txt", out_compl_obs_file="imps_and_chrates_compl.txt", delimiter='\t'):
# Dictionary format: key is the URL ID, value is a triplet [importance, change rate, flag with value 1 if the URL has a complete observation
# history available and 0 otherwise]
url_data = dict()
print("Reading the importance scores data...")
with open(imps_data_file) as imps_data:
line = imps_data.readline()
while line:
tokens = line.strip().split(delimiter, 1)
url_data[int(tokens[0])] = [int(tokens[1]), 0, 0]
line = imps_data.readline()
print("Learning the change rates for URLs with incomplete change observations...")
num_high_chrates = 0;
with open(changes_data_file) as changes_data:
line = changes_data.readline()
num_lines = 1
while line:
tokens = line.strip().split(delimiter, 2)
training_intervals = np.array(json.loads(tokens[2]))
# This is a reference to training_intervals, so operations on it will change training_intervals's contents!
training_timestamps = training_intervals
for i in range(1, training_timestamps.shape[0]):
training_timestamps[i, 0] += training_timestamps[i - 1, 0]
# Here we apply Equation 10 from the NeurIPS-2019 paper to learn URLs' Poisson change rates from histories.
learned_rate = LearnChRates_IncomplObs(training_timestamps, math.inf, sum(training_timestamps[:,0]), epsilon_learn_global)
url_data[int(tokens[0])][1] = learned_rate
if (num_lines % 10000 == 0):
print("Processed {} URLs".format(num_lines))
num_lines += 1
line = changes_data.readline()
print("Processing the change rates for URLs with complete change observations...")
with open(change_rates_compl_obs_data_file) as change_rates_compl_obs_data:
line = change_rates_compl_obs_data.readline()
while line:
tokens = line.strip().split(delimiter, 1)
id = int(tokens[0])
url_data[id][1] = float(tokens[1])
url_data[id][2] = 1
line = change_rates_compl_obs_data.readline()
print("Outputting data to files...")
incompl_obs_out = open(out_incompl_obs_file,"w+")
num_incompl_obs_records = 0
compl_obs_out = open(out_compl_obs_file,"w+")
num_compl_obs_records = 0
for record in url_data.values():
if (record[2] == 0):
incompl_obs_out.write("%d\t%f\n" % (record[0], record[1]))
num_incompl_obs_records += 1
elif (record[2] == 1):
compl_obs_out.write("%d\t%f\n" % (record[0], record[1]))
num_compl_obs_records += 1
print("A total of {} records written, {} for incomplete- and {} for complete-change-observation URLs.".format(num_incompl_obs_records + num_compl_obs_records, num_incompl_obs_records, num_compl_obs_records))
# This method computes the policy and its harmonic and binary costs for LambdaCrawl, LambdaCrawlApprox, BinaryLambdaCrawl(0.0),
# BinaryLambdaCrawl(0.4), UniformCrawl, and ChangeRateProportionalCrawl, assuming known change rates. The inputs are 2D arrays containing importance-
# change rate pairs for incomplete- and complete-change-observation sources.
def ExperimentHelper(importances_and_ch_rates_incompl_obs, importances_and_ch_rates_compl_obs):
# We can choose to subsample and experiment with a subset of the whole dataset; set size below to a smaller value to do so.
size = importances_and_ch_rates_incompl_obs.shape[0] + importances_and_ch_rates_compl_obs.shape[0]
# Determine the fraction of the overall dataset to sample in the current iteration
sample_frac = size / (importances_and_ch_rates_incompl_obs.shape[0] + importances_and_ch_rates_compl_obs.shape[0])
if (sample_frac >= 1.0):
importances_and_ch_rates_incompl_obs_partial = importances_and_ch_rates_incompl_obs
importances_and_ch_rates_compl_obs_partial = importances_and_ch_rates_compl_obs
else:
# Sample this fraction of a subset of the sources with incomplete change observations
if (importances_and_ch_rates_incompl_obs.shape[0] > 0):
importances_and_ch_rates_incompl_obs_partial = importances_and_ch_rates_incompl_obs[np.random.choice( \
importances_and_ch_rates_incompl_obs.shape[0], int(sample_frac * importances_and_ch_rates_incompl_obs.shape[0]), replace=False),:]
else:
importances_and_ch_rates_incompl_obs_partial = np.empty([0,2])
# Sample this fraction of a subset of the sources with complete change observations
if (importances_and_ch_rates_compl_obs.shape[0] > 0):
importances_and_ch_rates_compl_obs_partial = importances_and_ch_rates_compl_obs[np.random.choice( \
importances_and_ch_rates_compl_obs.shape[0], int(sample_frac * importances_and_ch_rates_compl_obs.shape[0]), replace=False),:]
else:
importances_and_ch_rates_compl_obs_partial = np.empty([0,2])
importances_incompl_obs = importances_and_ch_rates_incompl_obs_partial[:,0]
ch_rates_incompl_obs = importances_and_ch_rates_incompl_obs_partial[:,1]
importances_compl_obs = importances_and_ch_rates_compl_obs_partial[:,0]
ch_rates_compl_obs = importances_and_ch_rates_compl_obs_partial[:,1]
bandwidth = 0.2 * (importances_and_ch_rates_incompl_obs_partial.shape[0] + importances_and_ch_rates_compl_obs_partial.shape[0])
epsilon_incompl_obs = 0.1
importances = np.concatenate((importances_and_ch_rates_incompl_obs_partial[:,0], importances_and_ch_rates_compl_obs_partial[:,0]))
ch_rates = np.concatenate((importances_and_ch_rates_incompl_obs_partial[:,1], importances_and_ch_rates_compl_obs_partial[:,1]))
print("\n*****Running on a set of ", importances_and_ch_rates_incompl_obs_partial.shape[0], \
" sources with incomplete observations and ", importances_and_ch_rates_compl_obs_partial.shape[0], \
" sources with complete observations.", sep='')
#==================================================================================
# Random
print("\n*****Running UniformCrawl.")
crawl_rates = np.ones_like(importances) * bandwidth / len(importances)
policy_cost_harmonic = HarmonicPolicyCost(crawl_rates, [], importances, ch_rates, [], [])
policy_cost_binary = BinaryPolicyCost(crawl_rates, [], importances, ch_rates, [], [])
print("\tHarmonic policy cost was ", policy_cost_harmonic, ".\n\tBinary policy cost was ", policy_cost_binary, ".", sep='')
#===================================================================================
#==================================================================================
# Change-rate-proportional
print("\n")
print("*****Running ChangeRateProportionalCrawl on a set of ", importances_and_ch_rates_incompl_obs_partial.shape[0], \
" sources with incomplete observations and ", importances_and_ch_rates_compl_obs_partial.shape[0], \
" sources with complete observations.", sep='')
crawl_rates = ch_rates * bandwidth / sum(ch_rates)
policy_cost_harmonic = HarmonicPolicyCost(crawl_rates, [], importances, ch_rates, [], [])
policy_cost_binary = BinaryPolicyCost(crawl_rates, [], importances, ch_rates, [], [])
print("\tHarmonic policy cost was ", policy_cost_harmonic, ".\n\tBinary policy cost was ", policy_cost_binary, ".", sep='')
#===================================================================================
#===================================================================================
# Epsilon-lambda-crawl-binary with the best parameter value for harmonic-cost performance on the NeurIPS-2019 paper's dataset.
print("\n")
print("*****Running BinaryLambdaCrawl-Epsilon (**all sources are treated as having incomplete change observations**).")
epsilon = 0.4
crawl_rates_incompl_obs, crawl_probs_compl_obs = LambdaCrawlBinary_Epsilon(importances, ch_rates, epsilon, [], [], bandwidth)
policy_cost_harmonic = HarmonicPolicyCost(crawl_rates_incompl_obs, [], importances, ch_rates, [], [])
policy_cost_binary = BinaryPolicyCost(crawl_rates_incompl_obs, [], importances, ch_rates, [], [])
print("\tHarmonic policy cost with epsilon=", epsilon," was: ", policy_cost_harmonic, ".\n\tBinary policy cost with epsilon=", \
epsilon," was: ", policy_cost_binary, ".", sep='')
#===================================================================================
#===================================================================================
# Vanilla Epsilon-lambda-crawl-binary
print("\n")
print("*****Running BinaryLambdaCrawl-Epsilon (**all sources are treated as having incomplete change observations**).")
epsilon = 0.0
crawl_rates_incompl_obs, crawl_probs_compl_obs = LambdaCrawlBinary_Epsilon(importances, ch_rates, epsilon, [], [], bandwidth)
policy_cost_harmonic = HarmonicPolicyCost(crawl_rates_incompl_obs, [], importances, ch_rates, [], [])
policy_cost_binary = BinaryPolicyCost(crawl_rates_incompl_obs, [], importances, ch_rates, [], [])
print("\tHarmonic policy cost with epsilon=", epsilon," was: ", policy_cost_harmonic, ".\n\tBinary policy cost with epsilon=", \
epsilon," was: ", policy_cost_binary, ".", sep='')
#===================================================================================
#==================================================================================================
# LambdaCrawlApprox
print("\n")
print("*****Running LambdaCrawlApprox.")
crawl_rates_approx_incompl_obs, crawl_probs_approx_compl_obs = LambdaCrawlApprox(importances_incompl_obs, ch_rates_incompl_obs, \
epsilon_incompl_obs, importances_compl_obs, ch_rates_compl_obs, bandwidth)
policy_cost_harmonic = HarmonicPolicyCost(crawl_rates_approx_incompl_obs, crawl_probs_approx_compl_obs, importances_incompl_obs, \
ch_rates_incompl_obs, importances_compl_obs, ch_rates_compl_obs)
policy_cost_binary = BinaryPolicyCost(crawl_rates_approx_incompl_obs, crawl_probs_approx_compl_obs, importances_incompl_obs, \
ch_rates_incompl_obs, importances_compl_obs, ch_rates_compl_obs)
print("\tHarmonic policy cost was: ", policy_cost_harmonic, ".\n\tBinary policy cost was: ", policy_cost_binary, ".", sep='')
#===============================================================================================================================
#===============================================================================================================================
# LambdaCrawlApprox ignoring complete change observations
print("\n")
print("*****Running LambdaCrawlApprox (**all sources are treated as having incomplete change observations**).")
crawl_rates_approx_incompl_obs, crawl_probs_approx_compl_obs = LambdaCrawlApprox(importances, ch_rates, epsilon_incompl_obs, [], [], bandwidth)
policy_cost_harmonic = HarmonicPolicyCost(crawl_rates_approx_incompl_obs, crawl_probs_approx_compl_obs, importances, ch_rates, [], [])
policy_cost_binary = BinaryPolicyCost(crawl_rates_approx_incompl_obs, crawl_probs_approx_compl_obs, importances, ch_rates, [], [])
print("\tHarmonic policy cost was: ", policy_cost_harmonic, ".\n\tBinary policy cost was: ", policy_cost_binary, ".", sep='')
#===============================================================================================================================
#===============================================================================================================================
# LambdaCrawl
print("\n")
print("*****Running LambdaCrawl.")
crawl_rates_incompl_obs, crawl_probs_compl_obs = LambdaCrawl(importances_incompl_obs, ch_rates_incompl_obs, epsilon_incompl_obs, \
importances_compl_obs, ch_rates_compl_obs, bandwidth)
policy_cost_harmonic = HarmonicPolicyCost(crawl_rates_incompl_obs, crawl_probs_compl_obs, importances_incompl_obs, ch_rates_incompl_obs, \
importances_compl_obs, ch_rates_compl_obs)
policy_cost_binary = BinaryPolicyCost(crawl_rates_incompl_obs, crawl_probs_compl_obs, importances_incompl_obs, ch_rates_incompl_obs, \
importances_compl_obs, ch_rates_compl_obs)
print("\tHarmonic policy cost was: ", policy_cost_harmonic, ".\n\tBinary policy cost was: ", policy_cost_binary, ".", sep='')
#===============================================================================================================================
#===============================================================================================================================
# LambdaCrawl ignoring complete change observations
print("\n")
print("*****Running LambdaCrawl (**all sources are treated as having incomplete change observations**).")
crawl_rates_incompl_obs, crawl_probs_compl_obs = LambdaCrawl(importances, ch_rates, epsilon_incompl_obs, [], [], bandwidth)
policy_cost_harmonic = HarmonicPolicyCost(crawl_rates_incompl_obs, crawl_probs_compl_obs, importances, ch_rates, [], [])
policy_cost_binary = BinaryPolicyCost(crawl_rates_incompl_obs, crawl_probs_compl_obs, importances, ch_rates, [], [])
print("\tHarmonic policy cost was: ", policy_cost_harmonic, ".\n\tBinary policy cost was: ", policy_cost_binary, ".", sep='')
# Compute means and confidence intervals. Adapted from the Stack Overflow post at
# https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
def mean_confidence_interval(data, confidence = 0.95):
a = 1.0 * data
n = len(a)
m, se = np.mean(a), scipy.stats.sem(a)
h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
return m, h
def process_results(runs):
ms = np.zeros_like(runs[0,:])
hs = np.zeros_like(ms)
for j in range(runs.shape[1]):
m, h = mean_confidence_interval(runs[:,j])
ms[j] = m
hs[j] = h
return ms, hs
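# Illustrative usage sketch (not part of the original experiment code; the numbers are made up).
# The mean of [1, 2, 3, 4, 5] is 3.0 and the 95% confidence interval half-width is roughly 2.0
# (the t-quantile with 4 degrees of freedom times the standard error of the mean). process_results
# applies the same computation column-wise, e.g. to per-epoch policy costs gathered from several runs.
def Example_ConfidenceIntervals():
    data = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    m, h = mean_confidence_interval(data)
    print("Mean:", m, "95% CI half-width:", h)
    runs = np.array([[1.0, 2.0], [1.2, 2.2], [0.8, 1.8]])
    ms, hs = process_results(runs)
    print("Per-column means:", ms, "Per-column CI half-widths:", hs)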
# This method produces results as in Figure 1 of the NeurIPS-2019 paper. It computes the policy and its harmonic and binary costs for LambdaCrawl,
# LambdaCrawlApprox, BinaryLambdaCrawl(0.0), BinaryLambdaCrawl(0.4), UniformCrawl, and ChangeRateProportionalCrawl, assuming known change rates,
# on the full dataset from the NeurIPS-2019 paper.
def Experiment1(importances_and_ch_rates_incompl_obs_file = "imps_and_chrates_incompl.txt", \
importances_and_ch_rates_compl_obs_file="imps_and_chrates_compl.txt"):
print("Reading the data...")
importances_and_ch_rates_incompl_obs = np.genfromtxt(importances_and_ch_rates_incompl_obs_file, delimiter="\t")
importances_and_ch_rates_compl_obs = np.genfromtxt(importances_and_ch_rates_compl_obs_file, delimiter="\t")
ExperimentHelper(importances_and_ch_rates_incompl_obs, importances_and_ch_rates_compl_obs)
# This method produces results as in Figure 2 of the NeurIPS-2019 paper. It computes the policy and its harmonic and binary costs for LambdaCrawl,
# LambdaCrawlApprox, BinaryLambdaCrawl(0.0), and BinaryLambdaCrawl(0.4) on the set of URLs with complete observations only. Out of the above
# algorithms, only LambdaCrawl and LambdaCrawlApprox have special handling for these observations, the rest ignore them. For the sake of comparison,
# we also run LambdaCrawl and LambdaCrawlApprox in "ignorance mode" as well as in normal mode.
def Experiment2(importances_and_ch_rates_compl_obs_file="imps_and_chrates_compl.txt"):
print("Reading the data...")
importances_and_ch_rates_incompl_obs = np.empty([0,2])
importances_and_ch_rates_compl_obs = np.genfromtxt(importances_and_ch_rates_compl_obs_file, delimiter="\t")
ExperimentHelper(importances_and_ch_rates_incompl_obs, importances_and_ch_rates_compl_obs)
# This method produces results as in Figure 3 of the NeurIPS-2019 paper. It simulates runs of the RL algorithms LambdaLearnAndCrawl,
# LambdaLearnAndCrawlApprox, and BinaryLambdaLearnAndCrawl using the ground-truth Poisson change rates previously learned from the dataset,