Skip to main content

problemreductions/rules/
closestsubstring_ilp.rs

1//! Reduction from ClosestSubstring to ILP (Integer Linear Programming).
2//!
3//! Given an alphabet of size `q`, `n` input strings `s_1, ..., s_n` (not
4//! necessarily of equal length), and a window length `ell`, the goal is to
5//! pick a center `c in Sigma^ell` and one length-`ell` window from each input
6//! string that together minimize the worst-case Hamming distance between the
7//! center and any chosen window. The ILP encoding combines the
8//! center-selection variables of ClosestString with one-hot window-choice
9//! indicators, plus a radius variable that is active only on each selected
10//! window.
11//!
12//! - Integer `x_{r, a}` for `r in {0, ..., ell - 1}` and
13//!   `a in {0, ..., q - 1}`: `x_{r, a} = 1` iff the center has symbol `a` at
14//!   position `r`. The non-negativity of ILP variables together with the
15//!   assignment constraint forces every `x_{r, a} in {0, 1}`.
16//! - Integer `y_{i, p}` for input string `s_i` and window start
17//!   `p in {0, ..., W_i - 1}` where `W_i = |s_i| - ell + 1`: `y_{i, p} = 1` iff
18//!   window `p` is selected from string `s_i`.
19//! - Nonnegative integer radius variable `R`.
20//! - Assignment constraint: `sum_a x_{r, a} = 1` for every position `r`.
21//! - Window-choice constraint: `sum_p y_{i, p} = 1` for every input string.
22//! - Conditional radius constraint per `(i, p)`:
23//!   `R + sum_{r} x_{r, s_i[p + r]} - ell * y_{i, p} >= 0`.
24//!   When `y_{i, p} = 1`, this becomes `R >= d_H(c, s_i[p..p + ell))`; when
25//!   `y_{i, p} = 0`, the constraint is automatically satisfied.
26//! - Objective: minimize `R`.
27//!
28//! Reference: Ming Li, Bin Ma, and Lusheng Wang, "On the closest string and
29//! substring problems," Journal of the ACM 49(2):157-171, 2002.
30//! <https://doi.org/10.1145/506147.506150>
31
32use crate::models::algebraic::{LinearConstraint, ObjectiveSense, ILP};
33use crate::models::misc::ClosestSubstring;
34use crate::reduction;
35use crate::rules::traits::{ReduceTo, ReductionResult};
36
37/// Result of reducing ClosestSubstring to ILP.
38///
39/// Variable layout (`ILP<i32>`, all non-negative):
40/// - `x_{r, a}` at index `r * alphabet_size + a` for `r in [0, ell)` and
41///   `a in [0, q)`, forced into `{0, 1}` by the assignment constraints.
42/// - `y_{i, p}` at index `q * ell + window_offsets[i] + p` for input string
43///   `s_i` and window start `p in [0, W_i)`, forced into `{0, 1}` by the
44///   window-choice constraints.
45/// - `R` (radius) at index `q * ell + total_num_windows`, a non-negative
46///   integer in `[0, ell]`.
47#[derive(Debug, Clone)]
48pub struct ReductionClosestSubstringToILP {
49    target: ILP<i32>,
50    alphabet_size: usize,
51    substring_length: usize,
52    /// Prefix sums of per-string window counts: `window_offsets[i]` is the
53    /// number of `y_{j, p}` variables for `j < i`. Has length `num_strings`.
54    window_offsets: Vec<usize>,
55    /// `window_counts[i] = W_i = |s_i| - ell + 1`.
56    window_counts: Vec<usize>,
57}
58
59impl ReductionResult for ReductionClosestSubstringToILP {
60    type Source = ClosestSubstring;
61    type Target = ILP<i32>;
62
63    fn target_problem(&self) -> &ILP<i32> {
64        &self.target
65    }
66
67    /// Decode the integer ILP assignment into the source config layout.
68    ///
69    /// `ClosestSubstring::evaluate` expects `config` of length `ell + n`: the
70    /// first `ell` entries are the center symbols, the remaining `n` entries
71    /// are per-string window starts. For each center position `r`, we pick the
72    /// unique alphabet symbol `a` with `x_{r, a} = 1`; for each input string
73    /// `s_i`, we pick the unique window start `p` with `y_{i, p} = 1`. When no
74    /// indicator is set to 1 in some block (which only happens on partial /
75    /// infeasible ILP solutions), we fall back to 0 so the returned vector
76    /// still has the expected shape.
77    fn extract_solution(&self, target_solution: &[usize]) -> Vec<usize> {
78        let q = self.alphabet_size;
79        let ell = self.substring_length;
80        let y_base = q * ell;
81
82        let mut out = Vec::with_capacity(ell + self.window_counts.len());
83
84        // Center symbols.
85        for r in 0..ell {
86            let symbol = (0..q)
87                .find(|&a| target_solution.get(r * q + a).copied().unwrap_or(0) == 1)
88                .unwrap_or(0);
89            out.push(symbol);
90        }
91
92        // Window starts.
93        for (i, &w_i) in self.window_counts.iter().enumerate() {
94            let start = (0..w_i)
95                .find(|&p| {
96                    target_solution
97                        .get(y_base + self.window_offsets[i] + p)
98                        .copied()
99                        .unwrap_or(0)
100                        == 1
101                })
102                .unwrap_or(0);
103            out.push(start);
104        }
105
106        out
107    }
108}
109
110#[reduction(
111    overhead = {
112        num_vars = "alphabet_size * substring_length + total_num_windows + 1",
113        num_constraints = "substring_length + num_strings + total_num_windows + 1",
114    }
115)]
116impl ReduceTo<ILP<i32>> for ClosestSubstring {
117    type Result = ReductionClosestSubstringToILP;
118
119    fn reduce_to(&self) -> Self::Result {
120        let q = self.alphabet_size();
121        let ell = self.substring_length();
122        let strings = self.strings();
123        let n = strings.len();
124
125        let window_counts: Vec<usize> = strings.iter().map(|s| s.len() - ell + 1).collect();
126        let mut window_offsets: Vec<usize> = Vec::with_capacity(n);
127        {
128            let mut acc = 0usize;
129            for &w in &window_counts {
130                window_offsets.push(acc);
131                acc += w;
132            }
133        }
134        let total_windows: usize = window_counts.iter().sum();
135
136        let x_idx = |r: usize, a: usize| -> usize { r * q + a };
137        let y_base = q * ell;
138        let y_idx = |i: usize, p: usize| -> usize { y_base + window_offsets[i] + p };
139        let r_idx = y_base + total_windows;
140        let num_vars = r_idx + 1;
141
142        let mut constraints: Vec<LinearConstraint> =
143            Vec::with_capacity(ell + n + total_windows + 1);
144
145        // Assignment constraints: exactly one symbol per center position.
146        // Together with the non-negativity built into ILP<i32>, this also
147        // forces every x_{r, a} to lie in {0, 1}.
148        for r in 0..ell {
149            let terms: Vec<(usize, f64)> = (0..q).map(|a| (x_idx(r, a), 1.0)).collect();
150            constraints.push(LinearConstraint::eq(terms, 1.0));
151        }
152
153        // Tight upper bound on R: the worst-case Hamming distance over a
154        // length-ell window is at most ell. Added as a single-term `<=`
155        // constraint so the solver's bound-tightening pass (which scans for
156        // exactly this pattern) picks it up. Without this, R defaults to the
157        // full i32 domain, which severely degrades HiGHS performance even on
158        // tiny instances.
159        constraints.push(LinearConstraint::le(vec![(r_idx, 1.0)], ell as f64));
160
161        // Window-choice constraints: exactly one window per input string.
162        // Combined with non-negativity, this forces every y_{i, p} in {0, 1}.
163        for (i, &w_i) in window_counts.iter().enumerate() {
164            let terms: Vec<(usize, f64)> = (0..w_i).map(|p| (y_idx(i, p), 1.0)).collect();
165            constraints.push(LinearConstraint::eq(terms, 1.0));
166        }
167
168        // Conditional radius constraints: for every (input string, window
169        // start) pair, R + sum_r x_{r, s_i[p + r]} - ell * y_{i, p} >= 0.
170        // - If y_{i, p} = 1: R >= ell - sum_r x_{r, s_i[p + r]} = d_H(c, window).
171        // - If y_{i, p} = 0: the LHS is R + (nonneg match count) >= 0,
172        //   automatically satisfied because R >= 0.
173        for (i, s) in strings.iter().enumerate() {
174            for p in 0..window_counts[i] {
175                let mut terms: Vec<(usize, f64)> = Vec::with_capacity(ell + 2);
176                terms.push((r_idx, 1.0));
177                for r in 0..ell {
178                    terms.push((x_idx(r, s[p + r]), 1.0));
179                }
180                terms.push((y_idx(i, p), -(ell as f64)));
181                constraints.push(LinearConstraint::ge(terms, 0.0));
182            }
183        }
184
185        // Objective: minimize R.
186        let objective = vec![(r_idx, 1.0)];
187
188        let target = ILP::new(num_vars, constraints, objective, ObjectiveSense::Minimize);
189
190        ReductionClosestSubstringToILP {
191            target,
192            alphabet_size: q,
193            substring_length: ell,
194            window_offsets,
195            window_counts,
196        }
197    }
198}
199
200#[cfg(feature = "example-db")]
201pub(crate) fn canonical_rule_example_specs() -> Vec<crate::example_db::specs::RuleExampleSpec> {
202    vec![crate::example_db::specs::RuleExampleSpec {
203        id: "closestsubstring_to_ilp",
204        build: || {
205            // Canonical issue #1033 instance: binary alphabet, length-3
206            // windows on three length-5 strings. Optimum radius is 1; one
207            // optimal center is 010 with windows (0, 1, 0) selecting 000,
208            // 010, 110 from s_1, s_2, s_3 respectively.
209            let source = ClosestSubstring::new(
210                2,
211                vec![
212                    vec![0, 0, 0, 1, 1],
213                    vec![1, 0, 1, 0, 0],
214                    vec![1, 1, 0, 0, 1],
215                ],
216                3,
217            );
218            crate::example_db::specs::rule_example_via_ilp::<_, i32>(source)
219        },
220    }]
221}
222
223#[cfg(test)]
224#[path = "../unit_tests/rules/closestsubstring_ilp.rs"]
225mod tests;