ixa/random/
sampling_algorithms.rs

1//! Algorithms for uniform random sampling from hash sets or iterators. These algorithms are written to be generic
2//! over the container type using zero-cost trait abstractions.
3
4use std::borrow::Borrow;
5
6use crate::rand::seq::index::sample as choose_range;
7use crate::rand::Rng;
8
9/// Samples one element uniformly at random from an iterator whose length is known at runtime.
10///
11/// The caller must ensure that `(len, Some(len)) == iter.size_hint()`, i.e. the iterator
12/// reports its exact length via `size_hint`. We do not require `ExactSizeIterator`
13/// because that is a compile-time guarantee, whereas our requirement is a runtime condition.
14///
15/// The implementation selects a random index and uses `Iterator::nth`. For iterators
16/// with O(1) `nth` (e.g., randomly indexable structures), this is very efficient.
17/// The selected value is cloned.
18///
19/// The iterator need only support iteration; random indexing is not required.
20/// This function is intended for use when the result set is indexed and its length is known.
21pub fn sample_single_from_known_length<I, R, T>(rng: &mut R, mut iter: I) -> Option<T>
22where
23    R: Rng,
24    I: Iterator<Item = T>,
25{
26    // It is the caller's responsibility to ensure that `(len, Some(len)) == iter.size_hint()`.
27    let (length, _) = iter.size_hint();
28    if length == 0 {
29        return None;
30    }
31    // This little trick with `u32` makes this function 30% faster.
32    let index = rng.random_range(0..length as u32) as usize;
33    // The set need not be randomly indexable, so we have to use the `nth` method.
34    iter.nth(index)
35}
36
37/// Sample a random element uniformly from an iterator of unknown length.
38///
39/// We do not assume the container is randomly indexable, only that it can be iterated over.
40///
41/// This function implements "Algorithm L" from KIM-HUNG LI
42/// Reservoir-Sampling Algorithms of Time Complexity O(n(1 + log(N/n)))
43/// <https://dl.acm.org/doi/pdf/10.1145/198429.198435>
44///
45/// This algorithm is significantly slower than the "known length" algorithm (factor
46/// of 10^4). The reservoir algorithm from [`rand`](crate::rand) reduces to the "known length"
47/// algorithm when `iterator.size_hint()` returns `(k, Some(k))` for some `k`. Otherwise,
48/// this algorithm is much faster than the [`rand`](crate::rand)  implementation (factor of 100).
49pub fn sample_single_l_reservoir<I, R, T>(rng: &mut R, iterable: I) -> Option<T>
50where
51    R: Rng,
52    I: IntoIterator<Item = T>,
53{
54    let mut iter = iterable.into_iter();
55    let mut weight: f64 = rng.random(); // controls skip distance distribution
56    let mut log_one_minus_weight = (-weight).ln_1p();
57    let mut chosen_item: T = iter.next()?; // the currently selected element
58
59    // Number of elements to skip before the next candidate to consider for the reservoir.
60    // `iter.nth(skip)` skips `skip` elements and returns the next one.
61    let mut skip = (rng.random::<f64>().ln() / log_one_minus_weight).floor() as usize;
62    weight *= rng.random::<f64>();
63    log_one_minus_weight = (-weight).ln_1p();
64
65    loop {
66        match iter.nth(skip) {
67            Some(item) => {
68                chosen_item = item;
69                skip = (rng.random::<f64>().ln() / log_one_minus_weight).floor() as usize;
70                weight *= rng.random::<f64>();
71                log_one_minus_weight = (-weight).ln_1p();
72            }
73            None => return Some(chosen_item),
74        }
75    }
76}
77
78/// Count elements and sample one element uniformly from an iterator of unknown
79/// length.
80///
81/// Returns `(count, sample)` where `count` is the total number of items observed
82/// and `sample` is `None` iff `count == 0`.
83///
84/// This uses single-item reservoir sampling while tracking total count.
85pub fn count_and_sample_single_l_reservoir<I, R, T>(rng: &mut R, iterable: I) -> (usize, Option<T>)
86where
87    R: Rng,
88    I: IntoIterator<Item = T>,
89{
90    let mut count = 0usize;
91    let mut chosen_item: Option<T> = None;
92
93    for item in iterable {
94        count += 1;
95        if rng.random_range(0..count as u64) == 0 {
96            chosen_item = Some(item);
97        }
98    }
99
100    (count, chosen_item)
101}
102
103/// Samples `requested` elements uniformly at random without replacement from an iterator
104/// whose length is known at runtime. Requires `len >= requested`.
105///
106/// The caller must ensure that `(len, Some(len)) == iter.size_hint()`, i.e. the iterator
107/// reports its exact length via `size_hint`. We do not require `ExactSizeIterator`
108/// because that is a compile-time guarantee, whereas our requirement is a runtime condition.
109///
110/// The implementation selects random indices and uses `Iterator::nth`. For iterators
111/// with O(1) `nth` (e.g., randomly indexable structures), this is very efficient.
112/// Selected values are cloned.
113///
114/// This strategy is particularly effective for small `requested` (≤ 5), since it
115/// avoids iterating over the entire set and is typically faster than reservoir sampling.
116pub fn sample_multiple_from_known_length<I, R, T>(rng: &mut R, iter: I, requested: usize) -> Vec<T>
117where
118    R: Rng,
119    I: IntoIterator<Item = T>,
120{
121    let mut iter = iter.into_iter();
122    // It is the caller's responsibility to ensure that `(length, Some(length)) == iter.size_hint()`.
123    let (length, _) = iter.size_hint();
124
125    let mut indexes = Vec::with_capacity(requested);
126    indexes.extend(choose_range(rng, length, requested));
127    indexes.sort_unstable();
128
129    let mut selected = Vec::with_capacity(requested);
130    let mut consumed: usize = 0; // number of elements consumed from the iterator so far
131
132    // `iter.nth(n)` skips `n` elements and returns the next one, so to reach
133    // index `idx` we skip `idx - consumed` where `consumed` tracks how many
134    // elements have already been consumed.
135    for idx in indexes {
136        if let Some(item) = iter.nth(idx - consumed) {
137            selected.push(item);
138        }
139        consumed = idx + 1;
140    }
141
142    selected
143}
144
145/// Sample multiple random elements uniformly without replacement from a container of unknown length. If
146/// more samples are requested than are in the set, the function returns as many items as it can.
147///
148/// The implementation uses `Iterator::nth`. Randomly indexable structures will have a O(1) `nth`
149/// implementation and will be very efficient. The values are cloned.
150///
151/// This function implements "Algorithm L" from KIM-HUNG LI
152/// Reservoir-Sampling Algorithms of Time Complexity O(n(1 + log(N/n)))
153/// <https://dl.acm.org/doi/pdf/10.1145/198429.198435>
154///
155/// This algorithm is significantly faster than the reservoir algorithm in `rand` and is
156/// on par with the "known length" algorithm for large `requested` values.
157pub fn sample_multiple_l_reservoir<I, R, T>(rng: &mut R, iter: I, requested: usize) -> Vec<T>
158where
159    R: Rng,
160    I: IntoIterator<Item = T>,
161{
162    if requested == 0 {
163        return Vec::new();
164    }
165
166    let requested_recip = 1.0 / requested as f64;
167    let mut weight: f64 = rng.random(); // controls skip distance distribution
168    weight = weight.powf(requested_recip);
169    let mut log_one_minus_weight = (-weight).ln_1p();
170    let mut iter = iter.into_iter();
171    let mut reservoir: Vec<T> = iter.by_ref().take(requested).collect(); // the sample reservoir
172
173    if reservoir.len() < requested {
174        return reservoir;
175    }
176
177    // Number of elements to skip before the next candidate to consider for the reservoir.
178    // `iter.nth(skip)` skips `skip` elements and returns the next one.
179    let mut skip = (rng.random::<f64>().ln() / log_one_minus_weight).floor() as usize;
180    let uniform_random: f64 = rng.random();
181    weight *= uniform_random.powf(requested_recip);
182    log_one_minus_weight = (-weight).ln_1p();
183
184    loop {
185        match iter.nth(skip) {
186            Some(item) => {
187                let to_remove = rng.random_range(0..reservoir.len());
188                reservoir.swap_remove(to_remove);
189                reservoir.push(item);
190
191                skip = (rng.random::<f64>().ln() / log_one_minus_weight).floor() as usize;
192                let uniform_random: f64 = rng.random();
193                weight *= uniform_random.powf(requested_recip);
194                log_one_minus_weight = (-weight).ln_1p();
195            }
196            None => return reservoir,
197        }
198    }
199}
200
201/// Samples one element uniformly at random from `slice`, excluding any element
202/// equal to `excluded`. Returns `None` if the slice is empty or every element
203/// equals `excluded`.
204///
205/// `excluded` accepts either an owned `T` or a borrowed `&T` via the
206/// `Borrow<T>` bound. Dispatches to `sample_single_excluding_iteration` for slices of
207/// length `< 4` and `sample_single_excluding_rejection` otherwise. Tuned via
208/// `ixa-bench/criterion/sample_single_excluding.rs`.
209pub fn sample_single_excluding<'a, R, T, E>(
210    rng: &mut R,
211    slice: &'a [T],
212    excluded: E,
213) -> Option<&'a T>
214where
215    R: Rng,
216    T: PartialEq,
217    E: Borrow<T>,
218{
219    const SMALL_SLICE: usize = 4;
220    let excluded = excluded.borrow();
221    if slice.len() < SMALL_SLICE {
222        sample_single_excluding_iteration(rng, slice, excluded)
223    } else {
224        sample_single_excluding_rejection(rng, slice, excluded)
225    }
226}
227
228/// Linear-scan implementation of `sample_single_excluding`. Counts non-excluded
229/// entries, then picks the k-th. Wins for very small slices (`n <= 3`) where
230/// the per-trial overhead of rejection sampling exceeds the cost of a tiny
231/// filter. Exposed so benchmarks can compare strategies directly.
232pub fn sample_single_excluding_iteration<'a, R, T, E>(
233    rng: &mut R,
234    slice: &'a [T],
235    excluded: E,
236) -> Option<&'a T>
237where
238    R: Rng,
239    T: PartialEq,
240    E: Borrow<T>,
241{
242    let excluded = excluded.borrow();
243    let valid_count = slice.iter().filter(|&x| x != excluded).count();
244    if valid_count == 0 {
245        return None;
246    }
247    let k = rng.random_range(0..valid_count as u32) as usize;
248    slice.iter().filter(|&x| x != excluded).nth(k)
249}
250
251/// Rejection-sampling implementation of `sample_single_excluding`. Picks a
252/// uniform index, accepts if not equal to `excluded`. Wins at `n >= 4` and is
253/// essentially constant time when `excluded` appears 0 or 1 times. Falls
254/// through to `sample_single_excluding_iteration` after at most 16 consecutive
255/// matches (or `n`, whichever is smaller), which also returns `None` when
256/// every element matches. Exposed so benchmarks can compare strategies
257/// directly.
258pub fn sample_single_excluding_rejection<'a, R, T, E>(
259    rng: &mut R,
260    slice: &'a [T],
261    excluded: E,
262) -> Option<&'a T>
263where
264    R: Rng,
265    T: PartialEq,
266    E: Borrow<T>,
267{
268    // The `u32` cast on `random_range` arguments is faster than the `usize`
269    // form (see `sample_single_from_known_length`).
270    //
271    // Cap trials at `min(MAX_REJECTIONS, n)`: once we've drawn `n` indices
272    // and all matched, almost the entire slice equals `excluded` and the
273    // iteration path is cheaper than retrying.
274    const MAX_REJECTIONS: usize = 16;
275    if slice.is_empty() {
276        return None;
277    }
278    let excluded = excluded.borrow();
279    let trials = MAX_REJECTIONS.min(slice.len());
280    for _ in 0..trials {
281        let i = rng.random_range(0..slice.len() as u32) as usize;
282        let candidate = &slice[i];
283        if candidate != excluded {
284            return Some(candidate);
285        }
286    }
287    sample_single_excluding_iteration(rng, slice, excluded)
288}
289
290/// Sample one element uniformly from an iterator, excluding any element equal
291/// to `excluded`. Returns `None` if the iterator is empty or every element
292/// equals `excluded`.
293///
294/// This is the iterator counterpart to [`sample_single_excluding`]. It runs
295/// in O(n) time and is correct even when the iterator does not report an
296/// exact length. Prefer [`sample_single_excluding`] for slices, which can
297/// dispatch to a faster rejection-sampling strategy backed by random access.
298pub fn sample_single_excluding_l_reservoir<I, R, T, E>(
299    rng: &mut R,
300    iterable: I,
301    excluded: E,
302) -> Option<T>
303where
304    R: Rng,
305    T: PartialEq,
306    I: IntoIterator<Item = T>,
307    E: Borrow<T>,
308{
309    let excluded = excluded.borrow();
310    let (_, chosen) = count_and_sample_single_l_reservoir(
311        rng,
312        iterable.into_iter().filter(|item| item != excluded),
313    );
314    chosen
315}
316
317#[cfg(test)]
318mod tests {
319    use rand::rngs::StdRng;
320    use rand::SeedableRng;
321
322    use super::*;
323    use crate::hashing::{HashSet, HashSetExt};
324
325    #[test]
326    fn test_sample_single_l_reservoir_basic() {
327        let data: Vec<u32> = (0..1000).collect();
328        let seed: u64 = 42;
329        let mut rng = StdRng::seed_from_u64(seed);
330        let sample = sample_single_l_reservoir(&mut rng, data);
331
332        // Should return Some value
333        assert!(sample.is_some());
334
335        // Value should be in valid range
336        let value = sample.unwrap();
337        assert!(value < 1000);
338    }
339
340    #[test]
341    fn test_sample_single_l_reservoir_empty() {
342        let data: Vec<u32> = Vec::new();
343        let mut rng = StdRng::seed_from_u64(42);
344        let sample = sample_single_l_reservoir(&mut rng, data);
345
346        // Should return None for empty container
347        assert!(sample.is_none());
348    }
349
350    #[test]
351    fn test_sample_single_l_reservoir_single_element() {
352        let data: Vec<u32> = vec![42];
353        let mut rng = StdRng::seed_from_u64(1);
354        let sample = sample_single_l_reservoir(&mut rng, data);
355
356        // Should return the only element
357        assert_eq!(sample, Some(42));
358    }
359
360    #[test]
361    fn test_sample_single_l_reservoir_uniformity() {
362        let population: u32 = 1000;
363        let data: Vec<u32> = (0..population).collect();
364        let num_runs = 10000;
365        let num_bins = 10;
366        let mut counts = vec![0usize; num_bins];
367
368        for run in 0..num_runs {
369            let mut rng = StdRng::seed_from_u64(42 + run as u64);
370            let sample = sample_single_l_reservoir(&mut rng, data.iter().cloned());
371
372            if let Some(value) = sample {
373                let bin = (value as usize) / (population as usize / num_bins);
374                counts[bin] += 1;
375            }
376        }
377
378        // Expected count per bin for uniform sampling
379        let expected = num_runs as f64 / num_bins as f64;
380
381        // Compute chi-square statistic
382        let chi_square: f64 = counts
383            .iter()
384            .map(|&obs| {
385                let diff = (obs as f64) - expected;
386                diff * diff / expected
387            })
388            .sum();
389
390        // Degrees of freedom = num_bins - 1 = 9
391        // Critical χ²₀.₉₉₉ for df=9 is 27.877
392        let critical = 27.877;
393
394        println!("χ² = {}, counts = {:?}", chi_square, counts);
395
396        assert!(
397            chi_square < critical,
398            "Single sample fails uniformity test: χ² = {}, counts = {:?}",
399            chi_square,
400            counts
401        );
402    }
403
404    #[test]
405    fn test_sample_single_l_reservoir_hashset() {
406        let mut data = HashSet::new();
407        for i in 0..100 {
408            data.insert(i);
409        }
410
411        let mut rng = StdRng::seed_from_u64(42);
412        let sample = sample_single_l_reservoir(&mut rng, &data);
413
414        assert!(sample.is_some());
415        let value = sample.unwrap();
416        assert!(data.contains(value));
417    }
418
419    #[test]
420    fn test_count_and_sample_single_l_reservoir_empty() {
421        let data: Vec<u32> = Vec::new();
422        let mut rng = StdRng::seed_from_u64(42);
423        let (count, sample) = count_and_sample_single_l_reservoir(&mut rng, data);
424        assert_eq!(count, 0);
425        assert!(sample.is_none());
426    }
427
428    #[test]
429    fn test_count_and_sample_single_l_reservoir_count_matches() {
430        let data: Vec<u32> = (0..1000).collect();
431        let mut rng = StdRng::seed_from_u64(42);
432        let (count, sample) = count_and_sample_single_l_reservoir(&mut rng, data);
433        assert_eq!(count, 1000);
434        assert!(sample.is_some());
435    }
436
437    #[test]
438    fn test_count_and_sample_single_l_reservoir_single_element() {
439        let data: Vec<u32> = vec![7];
440        let mut rng = StdRng::seed_from_u64(42);
441        let (count, sample) = count_and_sample_single_l_reservoir(&mut rng, data);
442        assert_eq!(count, 1);
443        assert_eq!(sample, Some(7));
444    }
445
446    #[test]
447    fn test_sample_multiple_l_reservoir_basic() {
448        let data: Vec<u32> = (0..1000).collect();
449        let requested = 100;
450        let seed: u64 = 42;
451        let mut rng = StdRng::seed_from_u64(seed);
452        let sample = sample_multiple_l_reservoir(&mut rng, data, requested);
453
454        // Correct sample size
455        assert_eq!(sample.len(), requested);
456
457        // All sampled values are within the valid range
458        assert!(sample.iter().all(|v| *v < 1000));
459
460        // The sample should not have duplicates
461        let unique: HashSet<_> = sample.iter().collect();
462        assert_eq!(unique.len(), sample.len());
463    }
464
465    #[test]
466    fn test_sample_multiple_l_reservoir_empty() {
467        let data: Vec<u32> = Vec::new();
468        let mut rng = StdRng::seed_from_u64(42);
469        let sample = sample_multiple_l_reservoir(&mut rng, &data, 10);
470
471        // Should return empty vector for empty container
472        assert_eq!(sample.len(), 0);
473    }
474
475    #[test]
476    fn test_sample_multiple_l_reservoir_zero_requested() {
477        let data: Vec<u32> = (0..100).collect();
478        let mut rng = StdRng::seed_from_u64(42);
479        let sample = sample_multiple_l_reservoir(&mut rng, &data, 0);
480
481        // Should return empty vector when 0 requested
482        assert_eq!(sample.len(), 0);
483    }
484
485    #[test]
486    fn test_sample_multiple_l_reservoir_requested_exceeds_population() {
487        let data: Vec<u32> = (0..50).collect();
488        let requested = 100;
489        let mut rng = StdRng::seed_from_u64(42);
490        let sample = sample_multiple_l_reservoir(&mut rng, data, requested);
491
492        // Should return all available items when requested > population
493        assert_eq!(sample.len(), 50);
494
495        // All elements should be unique
496        let unique: HashSet<_> = sample.iter().collect();
497        assert_eq!(unique.len(), 50);
498
499        // All elements should be from the original data
500        assert!(sample.iter().all(|v| *v < 50));
501    }
502
503    #[test]
504    fn test_sample_multiple_l_reservoir_exact_population() {
505        let data: Vec<u32> = (0..100).collect();
506        let mut rng = StdRng::seed_from_u64(42);
507        let sample = sample_multiple_l_reservoir(&mut rng, data, 100);
508
509        // Should return all elements when requested == population
510        assert_eq!(sample.len(), 100);
511
512        let unique: HashSet<_> = sample.iter().collect();
513        assert_eq!(unique.len(), 100);
514    }
515
516    #[test]
517    fn test_sample_multiple_l_reservoir_single_element() {
518        let data: Vec<u32> = vec![42];
519        let mut rng = StdRng::seed_from_u64(1);
520        let sample = sample_multiple_l_reservoir(&mut rng, data, 1);
521
522        assert_eq!(sample.len(), 1);
523        assert_eq!(sample[0], 42);
524    }
525
526    #[test]
527    fn test_sample_multiple_l_reservoir_hashset() {
528        let mut data = HashSet::new();
529        for i in 0..100 {
530            data.insert(i);
531        }
532
533        let mut rng = StdRng::seed_from_u64(42);
534        let sample = sample_multiple_l_reservoir(&mut rng, &data, 10);
535
536        assert_eq!(sample.len(), 10);
537
538        // All sampled values should be in the original set
539        assert!(sample.iter().all(|v| data.contains(v)));
540
541        // No duplicates
542        let unique: HashSet<_> = sample.iter().collect();
543        assert_eq!(unique.len(), 10);
544    }
545
546    #[test]
547    fn test_sample_multiple_l_reservoir_small_sample() {
548        let data: Vec<u32> = (0..1000).collect();
549        let requested = 5;
550        let mut rng = StdRng::seed_from_u64(42);
551        let sample = sample_multiple_l_reservoir(&mut rng, &data, requested);
552
553        assert_eq!(sample.len(), requested);
554
555        // No duplicates
556        let unique: HashSet<_> = sample.iter().collect();
557        assert_eq!(unique.len(), requested);
558    }
559
560    #[test]
561    fn test_sample_multiple_l_reservoir_large_sample() {
562        let data: Vec<u32> = (0..1000).collect();
563        let requested = 900;
564        let mut rng = StdRng::seed_from_u64(42);
565        let sample = sample_multiple_l_reservoir(&mut rng, &data, requested);
566
567        assert_eq!(sample.len(), requested);
568
569        // No duplicates
570        let unique: HashSet<_> = sample.iter().collect();
571        assert_eq!(unique.len(), requested);
572    }
573
574    // Verifies that the reservoir sampling algorithm produces uniformly distributed
575    // samples by running it 1000 times and checking that the resulting chi-square
576    // statistics follow the expected chi-square(9) distribution. Note that this
577    // test is only approximately correct, reasonable only when `requested` is small
578    // relative to `population`, because `sample_multiple_l_reservoir` samples
579    // without replacement, while the chi-squared test assumes independent samples.
580    #[test]
581    fn test_sample_multiple_l_reservoir_uniformity() {
582        let population: u32 = 10000;
583        let data: Vec<u32> = (0..population).collect();
584        let requested = 100;
585        let num_runs = 1000;
586        let mut chi_squares = Vec::with_capacity(num_runs);
587
588        for run in 0..num_runs {
589            let mut rng = StdRng::seed_from_u64(42 + run as u64);
590            let sample = sample_multiple_l_reservoir(&mut rng, data.iter().cloned(), requested);
591
592            // Partition range 0..population into 10 equal-width bins
593            let mut counts = [0usize; 10];
594            for &value in &sample {
595                let bin = (value as usize) / (population as usize / 10);
596                counts[bin] += 1;
597            }
598
599            // Expected count per bin for uniform sampling
600            let expected = requested as f64 / 10.0; // = 10.0
601
602            // Compute chi-square statistic
603            let chi_square: f64 = counts
604                .iter()
605                .map(|&obs| {
606                    let diff = (obs as f64) - expected;
607                    diff * diff / expected
608                })
609                .sum();
610
611            chi_squares.push(chi_square);
612        }
613
614        // Now test that chi_squares follow a chi-square distribution with df=9
615        // We use quantiles of the chi-square(9) distribution to create bins
616        // and check if the observed counts match the expected uniform distribution
617
618        // Quantiles of chi-square distribution with df=9 at deciles (10 bins)
619        // These values define the bin boundaries such that each bin should contain
620        // 10% of the observations if they truly follow chi-square(9).
621        // Generate with Mathematica:
622        //     Table[Quantile[ChiSquareDistribution[9], p/10], {p, 0, 10}]//N
623        let quantiles = [
624            0.0,           // 0th percentile (minimum)
625            4.16816,       // 10th percentile
626            5.38005,       // 20th percentile
627            6.39331,       // 30th percentile
628            7.35703,       // 40th percentile
629            8.34283,       // 50th percentile (median)
630            9.41364,       // 60th percentile
631            10.6564,       // 70th percentile
632            12.2421,       // 80th percentile
633            14.6837,       // 90th percentile
634            f64::INFINITY, // 100th percentile (maximum)
635        ];
636
637        let num_bins = quantiles.len() - 1;
638        let mut chi_square_counts = vec![0usize; num_bins];
639
640        for &chi_sq in &chi_squares {
641            // Find which bin this chi-square value falls into
642            for i in 0..num_bins {
643                if chi_sq >= quantiles[i] && chi_sq < quantiles[i + 1] {
644                    chi_square_counts[i] += 1;
645                    break;
646                }
647            }
648        }
649
650        // Each bin should contain approximately num_runs / num_bins observations
651        let expected_per_bin = num_runs as f64 / num_bins as f64;
652        let chi_square_of_chi_squares: f64 = chi_square_counts
653            .iter()
654            .map(|&obs| {
655                let diff = (obs as f64) - expected_per_bin;
656                diff * diff / expected_per_bin
657            })
658            .sum();
659
660        // Degrees of freedom = (#bins - 1) = 9
661        // Critical χ²₀.₉₉₉ for df=9 is 27.877
662        let critical = 27.877;
663
664        println!(
665            "χ² = {}, counts = {:?}",
666            chi_square_of_chi_squares, chi_square_counts
667        );
668
669        assert!(
670            chi_square_of_chi_squares < critical,
671            "Chi-square statistics fail to follow chi-square(9) distribution: χ² = {}, counts = {:?}",
672            chi_square_of_chi_squares,
673            chi_square_counts
674        );
675    }
676
677    // Test that each element has equal probability of being selected
678    #[test]
679    fn test_sample_multiple_l_reservoir_element_probability() {
680        let population: u32 = 100;
681        let data: Vec<u32> = (0..population).collect();
682        let requested = 10;
683        let num_runs = 10000;
684        let mut selection_counts = vec![0usize; population as usize];
685
686        for run in 0..num_runs {
687            let mut rng = StdRng::seed_from_u64(42 + run as u64);
688            let sample = sample_multiple_l_reservoir(&mut rng, data.iter().cloned(), requested);
689
690            for &value in &sample {
691                selection_counts[value as usize] += 1;
692            }
693        }
694
695        // Each element should be selected with probability requested/population
696        // Expected count per element
697        let expected = (num_runs * requested) as f64 / population as f64;
698
699        // Compute chi-square statistic
700        let chi_square: f64 = selection_counts
701            .iter()
702            .map(|&obs| {
703                let diff = (obs as f64) - expected;
704                diff * diff / expected
705            })
706            .sum();
707
708        // Degrees of freedom = population - 1 = 99.
709        // Critical value uses p = 0.999 (alpha = 0.001): χ²_{0.999, 99} ≈ 148.23
710        // from the inverse chi-square CDF.
711        let critical = 148.23;
712
713        println!(
714            "χ² = {}, expected = {}, min = {}, max = {}",
715            chi_square,
716            expected,
717            selection_counts.iter().min().unwrap(),
718            selection_counts.iter().max().unwrap()
719        );
720
721        assert!(
722            chi_square < critical,
723            "Element selection probabilities are not uniform: χ² = {}",
724            chi_square
725        );
726    }
727
728    // Test reproducibility with same seed
729    #[test]
730    fn test_sample_multiple_l_reservoir_reproducibility() {
731        let data: Vec<u32> = (0..1000).collect();
732        let test_sizes = [1, 2, 5, 10, 100, 500];
733
734        for &requested in &test_sizes {
735            let seed: u64 = 12345;
736
737            let mut rng1 = StdRng::seed_from_u64(seed);
738            let sample1 = sample_multiple_l_reservoir(&mut rng1, &data, requested);
739
740            let mut rng2 = StdRng::seed_from_u64(seed);
741            let sample2 = sample_multiple_l_reservoir(&mut rng2, &data, requested);
742
743            // Verify correct sample size
744            assert_eq!(
745                sample1.len(),
746                requested,
747                "Sample size {} doesn't match requested size {}",
748                sample1.len(),
749                requested
750            );
751            assert_eq!(
752                sample2.len(),
753                requested,
754                "Sample size {} doesn't match requested size {}",
755                sample2.len(),
756                requested
757            );
758
759            // Same seed should produce identical samples
760            assert_eq!(
761                sample1, sample2,
762                "Reproducibility failed for requested={}",
763                requested
764            );
765        }
766    }
767
768    #[test]
769    fn test_sample_single_l_reservoir_reproducibility() {
770        let data: Vec<u32> = (0..1000).collect();
771        let seed: u64 = 12345;
772
773        let mut rng1 = StdRng::seed_from_u64(seed);
774        let sample1 = sample_single_l_reservoir(&mut rng1, &data);
775
776        let mut rng2 = StdRng::seed_from_u64(seed);
777        let sample2 = sample_single_l_reservoir(&mut rng2, &data);
778
779        // Same seed should produce identical samples
780        assert_eq!(sample1, sample2);
781    }
782
783    #[test]
784    fn sample_single_excluding_empty_slice_returns_none() {
785        let data: [u32; 0] = [];
786        let mut rng = StdRng::seed_from_u64(42);
787        assert_eq!(sample_single_excluding(&mut rng, &data, 7), None);
788    }
789
790    #[test]
791    fn sample_single_excluding_only_excluded_returns_none() {
792        let data = [9, 9, 9, 9];
793        let mut rng = StdRng::seed_from_u64(42);
794        assert_eq!(sample_single_excluding(&mut rng, &data, 9), None);
795    }
796
797    #[test]
798    fn sample_single_excluding_never_returns_excluded_small() {
799        let data: Vec<u32> = (0..10).collect();
800        let mut rng = StdRng::seed_from_u64(42);
801        for _ in 0..1000 {
802            let v = sample_single_excluding(&mut rng, &data, 3).unwrap();
803            assert_ne!(*v, 3);
804            assert!(*v < 10);
805        }
806    }
807
808    #[test]
809    fn sample_single_excluding_never_returns_excluded_large() {
810        // Large slice: exercises the rejection-sampling path.
811        let data: Vec<u32> = (0..1000).collect();
812        let mut rng = StdRng::seed_from_u64(42);
813        for _ in 0..1000 {
814            let v = sample_single_excluding(&mut rng, &data, 500).unwrap();
815            assert_ne!(*v, 500);
816            assert!(*v < 1000);
817        }
818    }
819
820    #[test]
821    fn sample_single_excluding_excluded_not_in_slice() {
822        let data: Vec<u32> = (0..10).collect();
823        let mut rng = StdRng::seed_from_u64(42);
824        for _ in 0..100 {
825            let v = sample_single_excluding(&mut rng, &data, 999).unwrap();
826            assert!(*v < 10);
827        }
828    }
829
830    #[test]
831    fn sample_single_excluding_falls_back_when_excluded_dominates() {
832        // 99.8% of values equal `excluded`: rejection retries get exhausted
833        // and the scan fallback runs. Both rare non-excluded values must show
834        // up across 200 samples.
835        let mut data = vec![0u32; 1000];
836        data[42] = 1;
837        data[800] = 2;
838        let mut rng = StdRng::seed_from_u64(42);
839        let mut seen: Vec<u32> = (0..200)
840            .map(|_| *sample_single_excluding(&mut rng, &data, 0).unwrap())
841            .collect();
842        seen.sort();
843        seen.dedup();
844        assert_eq!(seen, vec![1, 2]);
845    }
846
847    #[test]
848    fn sample_single_excluding_uniformity_small() {
849        // Small slice → iteration path.
850        let data: Vec<u32> = (0..20).collect();
851        let excluded = 7u32;
852        let num_runs = 50_000;
853        let mut counts = [0usize; 20];
854        let mut rng = StdRng::seed_from_u64(42);
855        for _ in 0..num_runs {
856            let v = sample_single_excluding(&mut rng, &data, excluded).unwrap();
857            counts[*v as usize] += 1;
858        }
859        assert_eq!(counts[excluded as usize], 0);
860
861        let expected = num_runs as f64 / 19.0;
862        let chi_square: f64 = counts
863            .iter()
864            .enumerate()
865            .filter(|(i, _)| *i != excluded as usize)
866            .map(|(_, &obs)| {
867                let diff = obs as f64 - expected;
868                diff * diff / expected
869            })
870            .sum();
871        // df = 18, χ²_{0.999} ≈ 42.31
872        assert!(chi_square < 42.31, "χ² = {chi_square}");
873    }
874
875    #[test]
876    fn sample_single_excluding_uniformity_large() {
877        // Large slice: rejection-sampling path.
878        let data: Vec<u32> = (0..200).collect();
879        let excluded = 99u32;
880        let num_runs = 200_000;
881        let mut counts = vec![0usize; 200];
882        let mut rng = StdRng::seed_from_u64(42);
883        for _ in 0..num_runs {
884            let v = sample_single_excluding(&mut rng, &data, excluded).unwrap();
885            counts[*v as usize] += 1;
886        }
887        assert_eq!(counts[excluded as usize], 0);
888
889        let expected = num_runs as f64 / 199.0;
890        let chi_square: f64 = counts
891            .iter()
892            .enumerate()
893            .filter(|(i, _)| *i != excluded as usize)
894            .map(|(_, &obs)| {
895                let diff = obs as f64 - expected;
896                diff * diff / expected
897            })
898            .sum();
899        // df = 198, χ²_{0.999} ≈ 264.69
900        assert!(chi_square < 264.69, "χ² = {chi_square}");
901    }
902
903    #[test]
904    #[allow(clippy::needless_borrows_for_generic_args)]
905    fn sample_single_excluding_accepts_owned_or_borrowed_excluded() {
906        // The `Borrow<T>` bound on `excluded` lets callers pass either an owned
907        // `T` or a `&T`; both must compile and produce the same answer.
908        let data: Vec<u32> = (0..10).collect();
909        let mut rng_owned = StdRng::seed_from_u64(42);
910        let mut rng_ref = StdRng::seed_from_u64(42);
911        let excluded = 3u32;
912        for _ in 0..50 {
913            let owned = sample_single_excluding(&mut rng_owned, &data, excluded);
914            let borrowed = sample_single_excluding(&mut rng_ref, &data, &excluded);
915            assert_eq!(owned, borrowed);
916        }
917    }
918
919    #[test]
920    fn sample_single_excluding_reproducibility() {
921        let data: Vec<u32> = (0..1000).collect();
922        let seed = 12345u64;
923
924        let mut rng1 = StdRng::seed_from_u64(seed);
925        let mut rng2 = StdRng::seed_from_u64(seed);
926        for _ in 0..100 {
927            let a = sample_single_excluding(&mut rng1, &data, 500);
928            let b = sample_single_excluding(&mut rng2, &data, 500);
929            assert_eq!(a, b);
930        }
931    }
932
933    #[test]
934    fn sample_single_excluding_l_reservoir_empty_returns_none() {
935        let data: [u32; 0] = [];
936        let mut rng = StdRng::seed_from_u64(42);
937        assert_eq!(
938            sample_single_excluding_l_reservoir(&mut rng, data, 7u32),
939            None
940        );
941    }
942
943    #[test]
944    fn sample_single_excluding_l_reservoir_only_excluded_returns_none() {
945        let data = [9u32, 9, 9, 9];
946        let mut rng = StdRng::seed_from_u64(42);
947        assert_eq!(
948            sample_single_excluding_l_reservoir(&mut rng, data, 9u32),
949            None
950        );
951    }
952
953    #[test]
954    fn sample_single_excluding_l_reservoir_never_returns_excluded() {
955        let data: Vec<u32> = (0..50).collect();
956        let mut rng = StdRng::seed_from_u64(42);
957        for _ in 0..1000 {
958            let v =
959                sample_single_excluding_l_reservoir(&mut rng, data.iter().copied(), 17u32).unwrap();
960            assert_ne!(v, 17);
961            assert!(v < 50);
962        }
963    }
964
965    #[test]
966    fn sample_single_excluding_l_reservoir_uniformity() {
967        let excluded = 7u32;
968        let data: Vec<u32> = (0..20).collect();
969        let num_runs = 50_000;
970        let mut counts = [0usize; 20];
971        let mut rng = StdRng::seed_from_u64(42);
972        for _ in 0..num_runs {
973            let v = sample_single_excluding_l_reservoir(&mut rng, data.iter().copied(), excluded)
974                .unwrap();
975            counts[v as usize] += 1;
976        }
977        assert_eq!(counts[excluded as usize], 0);
978
979        let expected = num_runs as f64 / 19.0;
980        let chi_square: f64 = counts
981            .iter()
982            .enumerate()
983            .filter(|(i, _)| *i != excluded as usize)
984            .map(|(_, &obs)| {
985                let diff = obs as f64 - expected;
986                diff * diff / expected
987            })
988            .sum();
989        // df = 18, χ²_{0.999} ≈ 42.31
990        assert!(chi_square < 42.31, "χ² = {chi_square}");
991    }
992
993    #[test]
994    #[allow(clippy::needless_borrows_for_generic_args)]
995    fn sample_single_excluding_l_reservoir_accepts_owned_or_borrowed_excluded() {
996        let data: Vec<u32> = (0..10).collect();
997        let excluded = 3u32;
998        let mut rng_owned = StdRng::seed_from_u64(42);
999        let mut rng_ref = StdRng::seed_from_u64(42);
1000        for _ in 0..50 {
1001            let owned =
1002                sample_single_excluding_l_reservoir(&mut rng_owned, data.iter().copied(), excluded);
1003            let borrowed =
1004                sample_single_excluding_l_reservoir(&mut rng_ref, data.iter().copied(), &excluded);
1005            assert_eq!(owned, borrowed);
1006        }
1007    }
1008
1009    #[test]
1010    fn sample_single_excluding_l_reservoir_reproducibility() {
1011        let data: Vec<u32> = (0..1000).collect();
1012        let seed = 12345u64;
1013        let mut rng1 = StdRng::seed_from_u64(seed);
1014        let mut rng2 = StdRng::seed_from_u64(seed);
1015        for _ in 0..100 {
1016            let a = sample_single_excluding_l_reservoir(&mut rng1, data.iter().copied(), 500u32);
1017            let b = sample_single_excluding_l_reservoir(&mut rng2, data.iter().copied(), 500u32);
1018            assert_eq!(a, b);
1019        }
1020    }
1021}