1use crate::rand::seq::index::sample as choose_range;
5use crate::rand::Rng;
6
7pub fn sample_single_from_known_length<I, R, T>(rng: &mut R, mut iter: I) -> Option<T>
20where
21 R: Rng,
22 I: Iterator<Item = T>,
23{
24 let (length, _) = iter.size_hint();
26 if length == 0 {
27 return None;
28 }
29 let index = rng.random_range(0..length as u32) as usize;
31 iter.nth(index)
33}
34
35pub fn sample_single_l_reservoir<I, R, T>(rng: &mut R, iterable: I) -> Option<T>
48where
49 R: Rng,
50 I: IntoIterator<Item = T>,
51{
52 let mut iter = iterable.into_iter();
53 let mut weight: f64 = rng.random_range(0.0..1.0); let mut chosen_item: T = iter.next()?; let mut skip = (f64::ln(rng.random_range(0.0..1.0)) / f64::ln(1.0 - weight)).floor() as usize;
59 weight *= rng.random_range(0.0..1.0);
60
61 loop {
62 match iter.nth(skip) {
63 Some(item) => {
64 chosen_item = item;
65 skip =
66 (f64::ln(rng.random_range(0.0..1.0)) / f64::ln(1.0 - weight)).floor() as usize;
67 weight *= rng.random_range(0.0..1.0);
68 }
69 None => return Some(chosen_item),
70 }
71 }
72}
73
74pub fn sample_multiple_from_known_length<I, R, T>(rng: &mut R, iter: I, requested: usize) -> Vec<T>
88where
89 R: Rng,
90 I: IntoIterator<Item = T>,
91{
92 let mut iter = iter.into_iter();
93 let (length, _) = iter.size_hint();
95
96 let mut indexes = Vec::with_capacity(requested);
97 indexes.extend(choose_range(rng, length, requested));
98 indexes.sort_unstable();
99
100 let mut selected = Vec::with_capacity(requested);
101 let mut consumed: usize = 0; for idx in indexes {
107 if let Some(item) = iter.nth(idx - consumed) {
108 selected.push(item);
109 }
110 consumed = idx + 1;
111 }
112
113 selected
114}
115
116pub fn sample_multiple_l_reservoir<I, R, T>(rng: &mut R, iter: I, requested: usize) -> Vec<T>
129where
130 R: Rng,
131 I: IntoIterator<Item = T>,
132{
133 if requested == 0 {
134 return Vec::new();
135 }
136
137 let mut weight: f64 = rng.random_range(0.0..1.0); weight = weight.powf(1.0 / requested as f64);
139 let mut iter = iter.into_iter();
140 let mut reservoir: Vec<T> = iter.by_ref().take(requested).collect(); if reservoir.len() < requested {
143 return reservoir;
144 }
145
146 let mut skip = (f64::ln(rng.random_range(0.0..1.0)) / f64::ln(1.0 - weight)).floor() as usize;
149 let uniform_random: f64 = rng.random_range(0.0..1.0);
150 weight *= uniform_random.powf(1.0 / requested as f64);
151
152 loop {
153 match iter.nth(skip) {
154 Some(item) => {
155 let to_remove = rng.random_range(0..reservoir.len());
156 reservoir.swap_remove(to_remove);
157 reservoir.push(item);
158
159 skip =
160 (f64::ln(rng.random_range(0.0..1.0)) / f64::ln(1.0 - weight)).floor() as usize;
161 let uniform_random: f64 = rng.random_range(0.0..1.0);
162 weight *= uniform_random.powf(1.0 / requested as f64);
163 }
164 None => return reservoir,
165 }
166 }
167}
168
169#[cfg(test)]
170mod tests {
171 use rand::rngs::StdRng;
172 use rand::SeedableRng;
173
174 use super::*;
175 use crate::hashing::{HashSet, HashSetExt};
176
177 #[test]
178 fn test_sample_single_l_reservoir_basic() {
179 let data: Vec<u32> = (0..1000).collect();
180 let seed: u64 = 42;
181 let mut rng = StdRng::seed_from_u64(seed);
182 let sample = sample_single_l_reservoir(&mut rng, data);
183
184 assert!(sample.is_some());
186
187 let value = sample.unwrap();
189 assert!(value < 1000);
190 }
191
192 #[test]
193 fn test_sample_single_l_reservoir_empty() {
194 let data: Vec<u32> = Vec::new();
195 let mut rng = StdRng::seed_from_u64(42);
196 let sample = sample_single_l_reservoir(&mut rng, data);
197
198 assert!(sample.is_none());
200 }
201
202 #[test]
203 fn test_sample_single_l_reservoir_single_element() {
204 let data: Vec<u32> = vec![42];
205 let mut rng = StdRng::seed_from_u64(1);
206 let sample = sample_single_l_reservoir(&mut rng, data);
207
208 assert_eq!(sample, Some(42));
210 }
211
212 #[test]
213 fn test_sample_single_l_reservoir_uniformity() {
214 let population: u32 = 1000;
215 let data: Vec<u32> = (0..population).collect();
216 let num_runs = 10000;
217 let num_bins = 10;
218 let mut counts = vec![0usize; num_bins];
219
220 for run in 0..num_runs {
221 let mut rng = StdRng::seed_from_u64(42 + run as u64);
222 let sample = sample_single_l_reservoir(&mut rng, data.iter().cloned());
223
224 if let Some(value) = sample {
225 let bin = (value as usize) / (population as usize / num_bins);
226 counts[bin] += 1;
227 }
228 }
229
230 let expected = num_runs as f64 / num_bins as f64;
232
233 let chi_square: f64 = counts
235 .iter()
236 .map(|&obs| {
237 let diff = (obs as f64) - expected;
238 diff * diff / expected
239 })
240 .sum();
241
242 let critical = 27.877;
245
246 println!("χ² = {}, counts = {:?}", chi_square, counts);
247
248 assert!(
249 chi_square < critical,
250 "Single sample fails uniformity test: χ² = {}, counts = {:?}",
251 chi_square,
252 counts
253 );
254 }
255
256 #[test]
257 fn test_sample_single_l_reservoir_hashset() {
258 let mut data = HashSet::new();
259 for i in 0..100 {
260 data.insert(i);
261 }
262
263 let mut rng = StdRng::seed_from_u64(42);
264 let sample = sample_single_l_reservoir(&mut rng, &data);
265
266 assert!(sample.is_some());
267 let value = sample.unwrap();
268 assert!(data.contains(value));
269 }
270
271 #[test]
272 fn test_sample_multiple_l_reservoir_basic() {
273 let data: Vec<u32> = (0..1000).collect();
274 let requested = 100;
275 let seed: u64 = 42;
276 let mut rng = StdRng::seed_from_u64(seed);
277 let sample = sample_multiple_l_reservoir(&mut rng, data, requested);
278
279 assert_eq!(sample.len(), requested);
281
282 assert!(sample.iter().all(|v| *v < 1000));
284
285 let unique: HashSet<_> = sample.iter().collect();
287 assert_eq!(unique.len(), sample.len());
288 }
289
290 #[test]
291 fn test_sample_multiple_l_reservoir_empty() {
292 let data: Vec<u32> = Vec::new();
293 let mut rng = StdRng::seed_from_u64(42);
294 let sample = sample_multiple_l_reservoir(&mut rng, &data, 10);
295
296 assert_eq!(sample.len(), 0);
298 }
299
300 #[test]
301 fn test_sample_multiple_l_reservoir_zero_requested() {
302 let data: Vec<u32> = (0..100).collect();
303 let mut rng = StdRng::seed_from_u64(42);
304 let sample = sample_multiple_l_reservoir(&mut rng, &data, 0);
305
306 assert_eq!(sample.len(), 0);
308 }
309
310 #[test]
311 fn test_sample_multiple_l_reservoir_requested_exceeds_population() {
312 let data: Vec<u32> = (0..50).collect();
313 let requested = 100;
314 let mut rng = StdRng::seed_from_u64(42);
315 let sample = sample_multiple_l_reservoir(&mut rng, data, requested);
316
317 assert_eq!(sample.len(), 50);
319
320 let unique: HashSet<_> = sample.iter().collect();
322 assert_eq!(unique.len(), 50);
323
324 assert!(sample.iter().all(|v| *v < 50));
326 }
327
328 #[test]
329 fn test_sample_multiple_l_reservoir_exact_population() {
330 let data: Vec<u32> = (0..100).collect();
331 let mut rng = StdRng::seed_from_u64(42);
332 let sample = sample_multiple_l_reservoir(&mut rng, data, 100);
333
334 assert_eq!(sample.len(), 100);
336
337 let unique: HashSet<_> = sample.iter().collect();
338 assert_eq!(unique.len(), 100);
339 }
340
341 #[test]
342 fn test_sample_multiple_l_reservoir_single_element() {
343 let data: Vec<u32> = vec![42];
344 let mut rng = StdRng::seed_from_u64(1);
345 let sample = sample_multiple_l_reservoir(&mut rng, data, 1);
346
347 assert_eq!(sample.len(), 1);
348 assert_eq!(sample[0], 42);
349 }
350
351 #[test]
352 fn test_sample_multiple_l_reservoir_hashset() {
353 let mut data = HashSet::new();
354 for i in 0..100 {
355 data.insert(i);
356 }
357
358 let mut rng = StdRng::seed_from_u64(42);
359 let sample = sample_multiple_l_reservoir(&mut rng, &data, 10);
360
361 assert_eq!(sample.len(), 10);
362
363 assert!(sample.iter().all(|v| data.contains(v)));
365
366 let unique: HashSet<_> = sample.iter().collect();
368 assert_eq!(unique.len(), 10);
369 }
370
371 #[test]
372 fn test_sample_multiple_l_reservoir_small_sample() {
373 let data: Vec<u32> = (0..1000).collect();
374 let requested = 5;
375 let mut rng = StdRng::seed_from_u64(42);
376 let sample = sample_multiple_l_reservoir(&mut rng, &data, requested);
377
378 assert_eq!(sample.len(), requested);
379
380 let unique: HashSet<_> = sample.iter().collect();
382 assert_eq!(unique.len(), requested);
383 }
384
385 #[test]
386 fn test_sample_multiple_l_reservoir_large_sample() {
387 let data: Vec<u32> = (0..1000).collect();
388 let requested = 900;
389 let mut rng = StdRng::seed_from_u64(42);
390 let sample = sample_multiple_l_reservoir(&mut rng, &data, requested);
391
392 assert_eq!(sample.len(), requested);
393
394 let unique: HashSet<_> = sample.iter().collect();
396 assert_eq!(unique.len(), requested);
397 }
398
399 #[test]
406 fn test_sample_multiple_l_reservoir_uniformity() {
407 let population: u32 = 10000;
408 let data: Vec<u32> = (0..population).collect();
409 let requested = 100;
410 let num_runs = 1000;
411 let mut chi_squares = Vec::with_capacity(num_runs);
412
413 for run in 0..num_runs {
414 let mut rng = StdRng::seed_from_u64(42 + run as u64);
415 let sample = sample_multiple_l_reservoir(&mut rng, data.iter().cloned(), requested);
416
417 let mut counts = [0usize; 10];
419 for &value in &sample {
420 let bin = (value as usize) / (population as usize / 10);
421 counts[bin] += 1;
422 }
423
424 let expected = requested as f64 / 10.0; let chi_square: f64 = counts
429 .iter()
430 .map(|&obs| {
431 let diff = (obs as f64) - expected;
432 diff * diff / expected
433 })
434 .sum();
435
436 chi_squares.push(chi_square);
437 }
438
439 let quantiles = [
449 0.0, 4.16816, 5.38005, 6.39331, 7.35703, 8.34283, 9.41364, 10.6564, 12.2421, 14.6837, f64::INFINITY, ];
461
462 let num_bins = quantiles.len() - 1;
463 let mut chi_square_counts = vec![0usize; num_bins];
464
465 for &chi_sq in &chi_squares {
466 for i in 0..num_bins {
468 if chi_sq >= quantiles[i] && chi_sq < quantiles[i + 1] {
469 chi_square_counts[i] += 1;
470 break;
471 }
472 }
473 }
474
475 let expected_per_bin = num_runs as f64 / num_bins as f64;
477 let chi_square_of_chi_squares: f64 = chi_square_counts
478 .iter()
479 .map(|&obs| {
480 let diff = (obs as f64) - expected_per_bin;
481 diff * diff / expected_per_bin
482 })
483 .sum();
484
485 let critical = 27.877;
488
489 println!(
490 "χ² = {}, counts = {:?}",
491 chi_square_of_chi_squares, chi_square_counts
492 );
493
494 assert!(
495 chi_square_of_chi_squares < critical,
496 "Chi-square statistics fail to follow chi-square(9) distribution: χ² = {}, counts = {:?}",
497 chi_square_of_chi_squares,
498 chi_square_counts
499 );
500 }
501
502 #[test]
504 fn test_sample_multiple_l_reservoir_element_probability() {
505 let population: u32 = 100;
506 let data: Vec<u32> = (0..population).collect();
507 let requested = 10;
508 let num_runs = 10000;
509 let mut selection_counts = vec![0usize; population as usize];
510
511 for run in 0..num_runs {
512 let mut rng = StdRng::seed_from_u64(42 + run as u64);
513 let sample = sample_multiple_l_reservoir(&mut rng, data.iter().cloned(), requested);
514
515 for &value in &sample {
516 selection_counts[value as usize] += 1;
517 }
518 }
519
520 let expected = (num_runs * requested) as f64 / population as f64;
523
524 let chi_square: f64 = selection_counts
526 .iter()
527 .map(|&obs| {
528 let diff = (obs as f64) - expected;
529 diff * diff / expected
530 })
531 .sum();
532
533 let critical = 148.23;
537
538 println!(
539 "χ² = {}, expected = {}, min = {}, max = {}",
540 chi_square,
541 expected,
542 selection_counts.iter().min().unwrap(),
543 selection_counts.iter().max().unwrap()
544 );
545
546 assert!(
547 chi_square < critical,
548 "Element selection probabilities are not uniform: χ² = {}",
549 chi_square
550 );
551 }
552
553 #[test]
555 fn test_sample_multiple_l_reservoir_reproducibility() {
556 let data: Vec<u32> = (0..1000).collect();
557 let test_sizes = [1, 2, 5, 10, 100, 500];
558
559 for &requested in &test_sizes {
560 let seed: u64 = 12345;
561
562 let mut rng1 = StdRng::seed_from_u64(seed);
563 let sample1 = sample_multiple_l_reservoir(&mut rng1, &data, requested);
564
565 let mut rng2 = StdRng::seed_from_u64(seed);
566 let sample2 = sample_multiple_l_reservoir(&mut rng2, &data, requested);
567
568 assert_eq!(
570 sample1.len(),
571 requested,
572 "Sample size {} doesn't match requested size {}",
573 sample1.len(),
574 requested
575 );
576 assert_eq!(
577 sample2.len(),
578 requested,
579 "Sample size {} doesn't match requested size {}",
580 sample2.len(),
581 requested
582 );
583
584 assert_eq!(
586 sample1, sample2,
587 "Reproducibility failed for requested={}",
588 requested
589 );
590 }
591 }
592
593 #[test]
594 fn test_sample_single_l_reservoir_reproducibility() {
595 let data: Vec<u32> = (0..1000).collect();
596 let seed: u64 = 12345;
597
598 let mut rng1 = StdRng::seed_from_u64(seed);
599 let sample1 = sample_single_l_reservoir(&mut rng1, &data);
600
601 let mut rng2 = StdRng::seed_from_u64(seed);
602 let sample2 = sample_single_l_reservoir(&mut rng2, &data);
603
604 assert_eq!(sample1, sample2);
606 }
607}