Skip to main content

problemreductions/models/misc/
closest_substring.rs

1//! Closest Substring problem implementation.
2//!
3//! Given a finite alphabet `{0, ..., alphabet_size - 1}`, a list of input
4//! strings (not necessarily of equal length), and a substring length `ell`,
5//! find a center string `c` of length `ell` and one length-`ell` window per
6//! input string that together minimize the maximum Hamming distance from `c`
7//! to any selected window.
8
9use crate::registry::{FieldInfo, ProblemSchemaEntry, ProblemSizeFieldEntry};
10use crate::traits::Problem;
11use crate::types::Min;
12use serde::{Deserialize, Serialize};
13
14inventory::submit! {
15    ProblemSchemaEntry {
16        name: "ClosestSubstring",
17        display_name: "Closest Substring",
18        aliases: &[],
19        dimensions: &[],
20        module_path: module_path!(),
21        description: "Find a center string of fixed length and one length-ell window per input string that minimize the maximum Hamming distance between the center and any selected window",
22        fields: &[
23            FieldInfo {
24                name: "alphabet_size",
25                type_name: "usize",
26                description: "Size q of the finite alphabet {0, ..., q-1}",
27            },
28            FieldInfo {
29                name: "strings",
30                type_name: "Vec<Vec<usize>>",
31                description: "Input strings s_1, ..., s_n over the alphabet (possibly of different lengths)",
32            },
33            FieldInfo {
34                name: "substring_length",
35                type_name: "usize",
36                description: "Common window length ell; every input string must have length at least substring_length",
37            },
38        ],
39    }
40}
41
42inventory::submit! {
43    ProblemSizeFieldEntry {
44        name: "ClosestSubstring",
45        fields: &[
46            "alphabet_size",
47            "num_strings",
48            "substring_length",
49            "total_length",
50            "total_num_windows",
51        ],
52    }
53}
54
55/// The Closest Substring problem.
56///
57/// Given a finite alphabet `Sigma = {0, ..., q - 1}`, `n` input strings
58/// `s_1, ..., s_n` over `Sigma` (not necessarily of equal length), and a
59/// window length `ell` with `ell <= |s_i|` for every `i`, find a center
60/// `c in Sigma^ell` and per-string window start positions `p_i in {0, ..., W_i - 1}`
61/// (where `W_i = |s_i| - ell + 1`) minimizing
62///
63/// `max_{1 <= i <= n} d_H(c, s_i[p_i .. p_i + ell))`,
64///
65/// where `d_H` is the Hamming distance. Every choice in the discrete cube is
66/// syntactically feasible.
67#[derive(Debug, Clone, Serialize, Deserialize)]
68pub struct ClosestSubstring {
69    alphabet_size: usize,
70    strings: Vec<Vec<usize>>,
71    substring_length: usize,
72}
73
74impl ClosestSubstring {
75    /// Create a new `ClosestSubstring` instance.
76    ///
77    /// # Panics
78    ///
79    /// Panics if:
80    /// - `strings` is empty (the problem requires at least one input string),
81    /// - `substring_length > |s_i|` for any input string,
82    /// - `alphabet_size == 0` while `substring_length > 0`,
83    /// - any symbol in any input string is `>= alphabet_size`.
84    pub fn new(alphabet_size: usize, strings: Vec<Vec<usize>>, substring_length: usize) -> Self {
85        assert!(
86            !strings.is_empty(),
87            "ClosestSubstring requires at least one input string"
88        );
89        assert!(
90            strings.iter().all(|s| s.len() >= substring_length),
91            "substring_length must be <= |s_i| for every input string"
92        );
93        assert!(
94            alphabet_size > 0 || substring_length == 0,
95            "alphabet_size must be > 0 when substring_length > 0"
96        );
97        assert!(
98            strings
99                .iter()
100                .flat_map(|s| s.iter())
101                .all(|&symbol| symbol < alphabet_size),
102            "input symbols must be less than alphabet_size"
103        );
104        Self {
105            alphabet_size,
106            strings,
107            substring_length,
108        }
109    }
110
111    /// Returns the alphabet size `q`.
112    pub fn alphabet_size(&self) -> usize {
113        self.alphabet_size
114    }
115
116    /// Returns the input strings.
117    pub fn strings(&self) -> &[Vec<usize>] {
118        &self.strings
119    }
120
121    /// Returns the number of input strings `n`.
122    pub fn num_strings(&self) -> usize {
123        self.strings.len()
124    }
125
126    /// Returns the common window length `ell`.
127    pub fn substring_length(&self) -> usize {
128        self.substring_length
129    }
130
131    /// Returns the sum of input string lengths.
132    pub fn total_length(&self) -> usize {
133        self.strings.iter().map(|s| s.len()).sum()
134    }
135
136    /// Returns `sum_i W_i`, where `W_i = |s_i| - substring_length + 1`.
137    pub fn total_num_windows(&self) -> usize {
138        self.strings
139            .iter()
140            .map(|s| s.len() - self.substring_length + 1)
141            .sum()
142    }
143
144    /// Returns `prod_i W_i`, the number of distinct window-selection tuples.
145    ///
146    /// Uses saturating multiplication so the value cannot overflow; callers
147    /// should treat a return of `usize::MAX` as "very large".
148    pub fn num_window_choice_product(&self) -> usize {
149        self.strings
150            .iter()
151            .map(|s| s.len() - self.substring_length + 1)
152            .fold(1usize, |acc, w| acc.saturating_mul(w))
153    }
154}
155
156impl Problem for ClosestSubstring {
157    const NAME: &'static str = "ClosestSubstring";
158    type Value = Min<i64>;
159
160    fn variant() -> Vec<(&'static str, &'static str)> {
161        crate::variant_params![]
162    }
163
164    fn dims(&self) -> Vec<usize> {
165        let ell = self.substring_length;
166        let mut dims = vec![self.alphabet_size; ell];
167        dims.extend(self.strings.iter().map(|s| s.len() - ell + 1));
168        dims
169    }
170
171    fn evaluate(&self, config: &[usize]) -> Min<i64> {
172        let ell = self.substring_length;
173        let n = self.num_strings();
174        if config.len() != ell + n {
175            return Min(None);
176        }
177        let (center, window_starts) = config.split_at(ell);
178        if center.iter().any(|&symbol| symbol >= self.alphabet_size) {
179            return Min(None);
180        }
181        for (i, &start) in window_starts.iter().enumerate() {
182            let w_i = self.strings[i].len() - ell + 1;
183            if start >= w_i {
184                return Min(None);
185            }
186        }
187        // Maximum Hamming distance from the center to the chosen window of each string.
188        let max_distance = window_starts
189            .iter()
190            .enumerate()
191            .map(|(i, &start)| {
192                let window = &self.strings[i][start..start + ell];
193                center
194                    .iter()
195                    .zip(window.iter())
196                    .filter(|(c, t)| c != t)
197                    .count() as i64
198            })
199            .max()
200            .unwrap_or(0);
201        Min(Some(max_distance))
202    }
203}
204
205crate::declare_variants! {
206    default ClosestSubstring => "alphabet_size ^ substring_length * num_window_choice_product",
207}
208
209#[cfg(feature = "example-db")]
210pub(crate) fn canonical_model_example_specs() -> Vec<crate::example_db::specs::ModelExampleSpec> {
211    vec![crate::example_db::specs::ModelExampleSpec {
212        id: "closest_substring",
213        instance: Box::new(ClosestSubstring::new(
214            2,
215            vec![
216                vec![0, 0, 0, 1, 1],
217                vec![1, 0, 1, 0, 0],
218                vec![1, 1, 0, 0, 1],
219            ],
220            3,
221        )),
222        // Center c = [0, 1, 0]; windows (0, 1, 0) selecting s_1[0..3] = 000,
223        // s_2[1..4] = 010, s_3[0..3] = 110 with distances 1, 0, 1 and radius 1.
224        optimal_config: vec![0, 1, 0, 0, 1, 0],
225        optimal_value: serde_json::json!(1),
226    }]
227}
228
229#[cfg(test)]
230#[path = "../../unit_tests/models/misc/closest_substring.rs"]
231mod tests;