Skip to main content

problemreductions/models/misc/
minimum_external_macro_data_compression.rs

1//! Minimum External Macro Data Compression problem implementation.
2//!
3//! Given an alphabet Sigma, a string s in Sigma*, and a pointer cost h,
4//! find a dictionary string D and compressed string C minimizing the total
5//! cost |D| + |C| + (h-1) * (number of pointer occurrences in D and C),
6//! such that s can be reconstructed from C by replacing pointers with their
7//! referenced substrings of D.
8//!
9//! The configuration uses 2*|s| slots: |s| slots for D (dictionary) and |s|
10//! slots for C (compressed string). D-slots use alphabet symbols or empty.
11//! C-slots use alphabet symbols, pointers into D (start, len), or empty.
12//! D is restricted to be pointer-free (pure alphabet string).
13//!
14//! This problem is NP-hard (Storer, 1977; Storer & Szymanski, 1978).
15//! Reference: Garey & Johnson A4 SR22.
16
17use crate::registry::{FieldInfo, ProblemSchemaEntry};
18use crate::traits::Problem;
19use crate::types::Min;
20use serde::{Deserialize, Serialize};
21
22inventory::submit! {
23    ProblemSchemaEntry {
24        name: "MinimumExternalMacroDataCompression",
25        display_name: "Minimum External Macro Data Compression",
26        aliases: &[],
27        dimensions: &[],
28        module_path: module_path!(),
29        description: "Find minimum-cost compression using an external dictionary and compressed string with pointers",
30        fields: &[
31            FieldInfo { name: "alphabet_size", type_name: "usize", description: "Size of the alphabet (symbols indexed 0..alphabet_size)" },
32            FieldInfo { name: "string", type_name: "Vec<usize>", description: "Source string as symbol indices" },
33            FieldInfo { name: "pointer_cost", type_name: "usize", description: "Pointer cost h (each pointer contributes h to the cost)" },
34        ],
35    }
36}
37
38/// Minimum External Macro Data Compression problem.
39///
40/// Given an alphabet of size `k`, a string `s` over `{0, ..., k-1}`, and
41/// a pointer cost `h`, find dictionary string D and compressed string C
42/// that minimize cost = |D| + |C| + (h-1) * (pointer count in C).
43///
44/// # Representation
45///
46/// The configuration is a vector of `2 * string_length` entries:
47/// - First `string_length` entries are D-slots: each is a symbol index
48///   in `{0, ..., alphabet_size-1}` or `alphabet_size` (empty/unused).
49/// - Next `string_length` entries are C-slots: each is:
50///   - A symbol index in `{0, ..., alphabet_size-1}` (literal)
51///   - `alphabet_size` (empty/unused)
52///   - A value in `{alphabet_size+1, ..., alphabet_size + |s|*(|s|+1)/2}`
53///     encoding a pointer (start, len) into D.
54///
55/// D is the prefix of non-empty D-slots. C is the prefix of non-empty C-slots.
56/// The cost is |D| + |C| + (h-1) * (number of pointer symbols in C).
57///
58/// # Example
59///
60/// ```
61/// use problemreductions::models::misc::MinimumExternalMacroDataCompression;
62/// use problemreductions::{Problem, Solver, BruteForce};
63///
64/// // Alphabet {a, b}, string "abab", pointer cost h=2
65/// let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1, 0, 1], 2);
66/// let solver = BruteForce::new();
67/// let solution = solver.find_witness(&problem);
68/// assert!(solution.is_some());
69/// ```
70#[derive(Debug, Clone, Serialize, Deserialize)]
71pub struct MinimumExternalMacroDataCompression {
72    alphabet_size: usize,
73    string: Vec<usize>,
74    pointer_cost: usize,
75}
76
77impl MinimumExternalMacroDataCompression {
78    /// Create a new MinimumExternalMacroDataCompression instance.
79    ///
80    /// # Panics
81    ///
82    /// Panics if `alphabet_size` is 0 and the string is non-empty, or if
83    /// any symbol in the string is >= `alphabet_size`, or if `pointer_cost` is 0.
84    pub fn new(alphabet_size: usize, string: Vec<usize>, pointer_cost: usize) -> Self {
85        assert!(
86            alphabet_size > 0 || string.is_empty(),
87            "alphabet_size must be > 0 when the string is non-empty"
88        );
89        assert!(
90            string
91                .iter()
92                .all(|&s| s < alphabet_size || alphabet_size == 0),
93            "all symbols must be less than alphabet_size"
94        );
95        assert!(pointer_cost > 0, "pointer_cost must be positive");
96        Self {
97            alphabet_size,
98            string,
99            pointer_cost,
100        }
101    }
102
103    /// Returns the length of the source string.
104    pub fn string_length(&self) -> usize {
105        self.string.len()
106    }
107
108    /// Returns the alphabet size.
109    pub fn alphabet_size(&self) -> usize {
110        self.alphabet_size
111    }
112
113    /// Returns the pointer cost h.
114    pub fn pointer_cost(&self) -> usize {
115        self.pointer_cost
116    }
117
118    /// Returns the source string.
119    pub fn string(&self) -> &[usize] {
120        &self.string
121    }
122
123    /// Returns the number of valid pointers into D (|s|*(|s|+1)/2).
124    fn num_pointers(&self) -> usize {
125        let n = self.string.len();
126        n * (n + 1) / 2
127    }
128
129    /// Returns the C-slot domain size: alphabet_size + 1 (empty) + num_pointers.
130    fn c_domain_size(&self) -> usize {
131        self.alphabet_size + 1 + self.num_pointers()
132    }
133
134    /// Decode a pointer index (offset from alphabet_size+1) into (start, len)
135    /// in the dictionary. Pointers are enumerated as:
136    /// index 0 -> (0, 1), 1 -> (0, 2), ..., n-1 -> (0, n),
137    /// n -> (1, 1), n+1 -> (1, 2), ..., etc.
138    fn decode_pointer(&self, ptr_idx: usize) -> Option<(usize, usize)> {
139        let n = self.string.len();
140        // Enumerate (start, len) pairs where 0 <= start < n, 1 <= len <= n - start
141        let mut idx = 0;
142        for start in 0..n {
143            let max_len = n - start;
144            if ptr_idx < idx + max_len {
145                let len = ptr_idx - idx + 1;
146                return Some((start, len));
147            }
148            idx += max_len;
149        }
150        None
151    }
152}
153
154impl Problem for MinimumExternalMacroDataCompression {
155    const NAME: &'static str = "MinimumExternalMacroDataCompression";
156    type Value = Min<usize>;
157
158    fn variant() -> Vec<(&'static str, &'static str)> {
159        crate::variant_params![]
160    }
161
162    fn dims(&self) -> Vec<usize> {
163        let n = self.string.len();
164        let d_domain = self.alphabet_size + 1; // symbols + empty
165        let c_domain = self.c_domain_size(); // symbols + empty + pointers
166        let mut dims = vec![d_domain; n]; // D-slots
167        dims.extend(vec![c_domain; n]); // C-slots
168        dims
169    }
170
171    fn evaluate(&self, config: &[usize]) -> Min<usize> {
172        let n = self.string.len();
173        if config.len() != 2 * n {
174            return Min(None);
175        }
176
177        // Handle empty string case
178        if n == 0 {
179            return Min(Some(0));
180        }
181
182        let empty_d = self.alphabet_size; // empty marker for D-slots
183        let empty_c = self.alphabet_size; // empty marker for C-slots
184
185        // Decode D: prefix of non-empty D-slots
186        let d_slots = &config[..n];
187        let d_len = d_slots.iter().position(|&v| v == empty_d).unwrap_or(n);
188
189        // Verify contiguous: all after first empty must be empty
190        for &v in &d_slots[d_len..] {
191            if v != empty_d {
192                return Min(None);
193            }
194        }
195
196        // Verify D symbols are valid alphabet symbols
197        let d_str: Vec<usize> = d_slots[..d_len].to_vec();
198        if d_str.iter().any(|&v| v >= self.alphabet_size) {
199            return Min(None);
200        }
201
202        // Decode C: prefix of non-empty C-slots
203        let c_slots = &config[n..];
204        let c_len = c_slots.iter().position(|&v| v == empty_c).unwrap_or(n);
205
206        // Verify contiguous: all after first empty must be empty
207        for &v in &c_slots[c_len..] {
208            if v != empty_c {
209                return Min(None);
210            }
211        }
212
213        // Decode C into a sequence of symbols, counting pointers
214        let mut decoded = Vec::new();
215        let mut pointer_count: usize = 0;
216
217        for &v in &c_slots[..c_len] {
218            if v < self.alphabet_size {
219                // Literal symbol
220                decoded.push(v);
221            } else if v > self.alphabet_size {
222                // Pointer into D
223                let ptr_idx = v - (self.alphabet_size + 1);
224                if let Some((start, len)) = self.decode_pointer(ptr_idx) {
225                    // Pointer must reference valid portion of D
226                    if start + len > d_len {
227                        return Min(None);
228                    }
229                    decoded.extend_from_slice(&d_str[start..start + len]);
230                    pointer_count += 1;
231                } else {
232                    return Min(None);
233                }
234            } else {
235                // v == empty_c, but we already filtered those out
236                return Min(None);
237            }
238        }
239
240        // Check decoded string matches the source string
241        if decoded != self.string {
242            return Min(None);
243        }
244
245        // Compute cost: |D| + |C| + (h-1) * pointer_count
246        let cost = d_len + c_len + (self.pointer_cost - 1) * pointer_count;
247        Min(Some(cost))
248    }
249}
250
251crate::declare_variants! {
252    default MinimumExternalMacroDataCompression => "(alphabet_size + 1) ^ string_length * (alphabet_size + 1 + string_length * (string_length + 1) / 2) ^ string_length",
253}
254
255#[cfg(feature = "example-db")]
256pub(crate) fn canonical_model_example_specs() -> Vec<crate::example_db::specs::ModelExampleSpec> {
257    // Issue #441 example: alphabet {a,b,c,d,e,f} (6), s="abcdefabcdefabcdef" (18), h=2.
258    // Optimal: D="abcdef"(6), C = ptr(0,6) ptr(0,6) ptr(0,6), cost = 6+3+(2-1)*3 = 12.
259    // Solved via ILP reduction (brute force infeasible at this size).
260    //
261    // Config encoding (2*18 = 36 slots):
262    // D-slots: [0,1,2,3,4,5, 6,6,...,6] (6 symbols + 12 empty, empty=alphabet_size=6)
263    // C-slots: [ptr(0,6), ptr(0,6), ptr(0,6), 6,6,...,6] (3 pointers + 15 empty)
264    // ptr(0,6) index: start=0, len=6 → index 5 → encoded as 6+1+5 = 12
265    let s: Vec<usize> = (0..6).cycle().take(18).collect();
266    let mut optimal_config = vec![0, 1, 2, 3, 4, 5];
267    optimal_config.extend(vec![6; 12]); // empty D-slots
268    optimal_config.extend(vec![12, 12, 12]); // 3 pointers to D[0..6]
269    optimal_config.extend(vec![6; 15]); // empty C-slots
270    vec![crate::example_db::specs::ModelExampleSpec {
271        id: "minimum_external_macro_data_compression",
272        instance: Box::new(MinimumExternalMacroDataCompression::new(6, s, 2)),
273        optimal_config,
274        optimal_value: serde_json::json!(12),
275    }]
276}
277
278#[cfg(test)]
279#[path = "../../unit_tests/models/misc/minimum_external_macro_data_compression.rs"]
280mod tests;