Import difflib 0.4.0

author DongHun Kwak <dh0128.kwak@samsung.com>

Tue, 7 Nov 2023 07:24:54 +0000 (16:24 +0900)

committer DongHun Kwak <dh0128.kwak@samsung.com>

Tue, 7 Nov 2023 07:24:54 +0000 (16:24 +0900)
author DongHun Kwak <dh0128.kwak@samsung.com>
Tue, 7 Nov 2023 07:24:54 +0000 (16:24 +0900)
committer DongHun Kwak <dh0128.kwak@samsung.com>
Tue, 7 Nov 2023 07:24:54 +0000 (16:24 +0900)
diff --git a/Cargo.toml b/Cargo.toml

new file mode 100644 (file)

index 0000000..29a094f
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,26 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g. crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+name = "difflib"
+version = "0.4.0"
+authors = ["Dima Kudosh <dimakudosh@gmail.com>"]
+include = ["**/*.rs", "Cargo.toml"]
+description = "Port of Python's difflib library to Rust."
+homepage = "https://github.com/DimaKudosh/difflib"
+documentation = "https://github.com/DimaKudosh/difflib/wiki"
+keywords = ["difflib", "text", "diff"]
+license = "MIT"
+repository = "https://github.com/DimaKudosh/difflib"
+
+[[test]]
+name = "tests"
diff --git a/Cargo.toml.orig b/Cargo.toml.orig

new file mode 100644 (file)

index 0000000..2edf6df
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,19 @@
+[package]
+name = "difflib"
+version = "0.4.0"
+authors = ["Dima Kudosh <dimakudosh@gmail.com>"]
+description = "Port of Python's difflib library to Rust."
+documentation = "https://github.com/DimaKudosh/difflib/wiki"
+homepage = "https://github.com/DimaKudosh/difflib"
+repository = "https://github.com/DimaKudosh/difflib"
+keywords = ["difflib", "text", "diff"]
+license = "MIT"
+include = [
+    "**/*.rs",
+    "Cargo.toml",
+]
+
+
+[[test]]
+name = "tests"
+
diff --git a/examples/example.rs b/examples/example.rs

new file mode 100644 (file)

index 0000000..3f06d0a
--- /dev/null
+++ b/examples/example.rs
@@ -0,0 +1,63 @@
+extern crate difflib;
+
+use difflib::differ::Differ;
+use difflib::sequencematcher::SequenceMatcher;
+
+fn main() {
+    // unified_diff
+    let first_text = "one two three four".split(" ").collect::<Vec<&str>>();
+    let second_text = "zero one tree four".split(" ").collect::<Vec<&str>>();
+    let diff = difflib::unified_diff(
+        &first_text,
+        &second_text,
+        "Original",
+        "Current",
+        "2005-01-26 23:30:50",
+        "2010-04-02 10:20:52",
+        3,
+    );
+    for line in &diff {
+        println!("{:?}", line);
+    }
+
+    //context_diff
+    let diff = difflib::context_diff(
+        &first_text,
+        &second_text,
+        "Original",
+        "Current",
+        "2005-01-26 23:30:50",
+        "2010-04-02 10:20:52",
+        3,
+    );
+    for line in &diff {
+        println!("{:?}", line);
+    }
+
+    //get_close_matches
+    let words = vec!["ape", "apple", "peach", "puppy"];
+    let result = difflib::get_close_matches("appel", words, 3, 0.6);
+    println!("{:?}", result);
+
+    //Differ examples
+    let differ = Differ::new();
+    let diff = differ.compare(&first_text, &second_text);
+    for line in &diff {
+        println!("{:?}", line);
+    }
+
+    //SequenceMatcher examples
+    let mut matcher = SequenceMatcher::new("one two three four", "zero one tree four");
+    let m = matcher.find_longest_match(0, 18, 0, 18);
+    println!("{:?}", m);
+    let all_matches = matcher.get_matching_blocks();
+    println!("{:?}", all_matches);
+    let opcode = matcher.get_opcodes();
+    println!("{:?}", opcode);
+    let grouped_opcodes = matcher.get_grouped_opcodes(2);
+    println!("{:?}", grouped_opcodes);
+    let ratio = matcher.ratio();
+    println!("{:?}", ratio);
+    matcher.set_seqs("aaaaa", "aaaab");
+    println!("{:?}", matcher.ratio());
+}
diff --git a/src/differ.rs b/src/differ.rs

new file mode 100644 (file)

index 0000000..5caef9a
--- /dev/null
+++ b/src/differ.rs
@@ -0,0 +1,340 @@
+use sequencematcher::SequenceMatcher;
+use std::cmp;
+use utils::{count_leading, str_with_similar_chars};
+
+#[derive(Default)]
+pub struct Differ {
+    pub line_junk: Option<fn(&&str) -> bool>,
+    pub char_junk: Option<fn(&char) -> bool>,
+}
+
+impl Differ {
+    pub fn new() -> Differ {
+        Differ {
+            line_junk: None,
+            char_junk: None,
+        }
+    }
+
+    pub fn compare(&self, first_sequence: &[&str], second_sequence: &[&str]) -> Vec<String> {
+        let mut matcher = SequenceMatcher::new(first_sequence, second_sequence);
+        matcher.set_is_junk(self.line_junk);
+        let mut res = Vec::new();
+        for opcode in matcher.get_opcodes() {
+            let mut gen = Vec::new();
+            match opcode.tag.as_ref() {
+                "replace" => {
+                    gen = self.fancy_replace(
+                        first_sequence,
+                        opcode.first_start,
+                        opcode.first_end,
+                        second_sequence,
+                        opcode.second_start,
+                        opcode.second_end,
+                    )
+                }
+                "delete" => {
+                    gen = self.dump("-", first_sequence, opcode.first_start, opcode.first_end)
+                }
+                "insert" => {
+                    gen = self.dump("+", second_sequence, opcode.second_start, opcode.second_end)
+                }
+                "equal" => {
+                    gen = self.dump(" ", first_sequence, opcode.first_start, opcode.first_end)
+                }
+                _ => {}
+            }
+            for i in gen {
+                res.push(i);
+            }
+        }
+        res
+    }
+
+    fn dump(&self, tag: &str, sequence: &[&str], start: usize, end: usize) -> Vec<String> {
+        let mut res = Vec::new();
+        for i in start..end {
+            if let Some(s) = sequence.get(i) {
+                res.push(format!("{} {}", tag, s))
+            }
+        }
+        res
+    }
+
+    fn plain_replace(
+        &self,
+        first_sequence: &[&str],
+        first_start: usize,
+        first_end: usize,
+        second_sequence: &[&str],
+        second_start: usize,
+        second_end: usize,
+    ) -> Vec<String> {
+        if !(first_start < first_end && second_start < second_end) {
+            return Vec::new();
+        }
+        let (mut first, second) = if second_end - second_start < first_end - first_start {
+            (
+                self.dump("+", second_sequence, second_start, second_end),
+                self.dump("-", first_sequence, first_start, first_end),
+            )
+        } else {
+            (
+                self.dump("-", first_sequence, first_start, first_end),
+                self.dump("+", second_sequence, second_start, second_end),
+            )
+        };
+        for s in second {
+            first.push(s);
+        }
+        first
+    }
+
+    fn fancy_replace(
+        &self,
+        first_sequence: &[&str],
+        first_start: usize,
+        first_end: usize,
+        second_sequence: &[&str],
+        second_start: usize,
+        second_end: usize,
+    ) -> Vec<String> {
+        let mut res = Vec::new();
+        let (mut best_ratio, cutoff) = (0.74, 0.75);
+        let (mut best_i, mut best_j) = (0, 0);
+        let mut eqi: Option<usize> = None;
+        let mut eqj: Option<usize> = None;
+        for (j, second_sequence_str) in second_sequence
+            .iter()
+            .enumerate()
+            .take(second_end)
+            .skip(second_start)
+        {
+            for (i, first_sequence_str) in first_sequence
+                .iter()
+                .enumerate()
+                .take(second_end)
+                .skip(second_start)
+            {
+                if first_sequence_str == second_sequence_str {
+                    if eqi.is_none() {
+                        eqi = Some(i);
+                        eqj = Some(j);
+                    }
+                    continue;
+                }
+                let (first_sequence_chars, second_sequence_chars) = (
+                    first_sequence_str.chars().collect::<Vec<char>>(),
+                    second_sequence_str.chars().collect::<Vec<char>>(),
+                );
+                let mut cruncher =
+                    SequenceMatcher::new(&first_sequence_chars, &second_sequence_chars);
+                cruncher.set_is_junk(self.char_junk);
+                if cruncher.ratio() > best_ratio {
+                    best_ratio = cruncher.ratio();
+                    best_i = i;
+                    best_j = j;
+                }
+            }
+        }
+        if best_ratio < cutoff {
+            if eqi.is_none() {
+                res.extend(
+                    self.plain_replace(
+                        first_sequence,
+                        first_start,
+                        first_end,
+                        second_sequence,
+                        second_start,
+                        second_end,
+                    ).iter()
+                        .cloned(),
+                );
+                return res;
+            }
+            best_i = eqi.unwrap();
+            best_j = eqj.unwrap();
+        } else {
+            eqi = None;
+        }
+        res.extend(
+            self.fancy_helper(
+                first_sequence,
+                first_start,
+                best_i,
+                second_sequence,
+                second_start,
+                best_j,
+            ).iter()
+                .cloned(),
+        );
+        let first_element = &first_sequence[best_i];
+        let second_element = &second_sequence[best_j];
+        if eqi.is_none() {
+            let (mut first_tag, mut second_tag) = (String::new(), String::new());
+            let first_element_chars: Vec<char> = first_element.chars().collect();
+            let second_element_chars: Vec<char> = second_element.chars().collect();
+            let mut cruncher = SequenceMatcher::new(&first_element_chars, &second_element_chars);
+            cruncher.set_is_junk(self.char_junk);
+            for opcode in &cruncher.get_opcodes() {
+                let (first_length, second_length) = (
+                    opcode.first_end - opcode.first_start,
+                    opcode.second_end - opcode.second_start,
+                );
+                match opcode.tag.as_ref() {
+                    "replace" => {
+                        first_tag.push_str(&str_with_similar_chars('^', first_length));
+                        second_tag.push_str(&str_with_similar_chars('^', second_length));
+                    }
+                    "delete" => {
+                        first_tag.push_str(&str_with_similar_chars('-', first_length));
+                    }
+                    "insert" => {
+                        second_tag.push_str(&str_with_similar_chars('+', second_length));
+                    }
+                    "equal" => {
+                        first_tag.push_str(&str_with_similar_chars(' ', first_length));
+                        second_tag.push_str(&str_with_similar_chars(' ', second_length));
+                    }
+                    _ => {}
+                }
+            }
+            res.extend(
+                self.qformat(&first_element, &second_element, &first_tag, &second_tag)
+                    .iter()
+                    .cloned(),
+            );
+        } else {
+            let mut s = String::from("  ");
+            s.push_str(&first_element);
+            res.push(s);
+        }
+        res.extend(
+            self.fancy_helper(
+                first_sequence,
+                best_i + 1,
+                first_end,
+                second_sequence,
+                best_j + 1,
+                second_end,
+            ).iter()
+                .cloned(),
+        );
+        res
+    }
+
+    fn fancy_helper(
+        &self,
+        first_sequence: &[&str],
+        first_start: usize,
+        first_end: usize,
+        second_sequence: &[&str],
+        second_start: usize,
+        second_end: usize,
+    ) -> Vec<String> {
+        let mut res = Vec::new();
+        if first_start < first_end {
+            if second_start < second_end {
+                res = self.fancy_replace(
+                    first_sequence,
+                    first_start,
+                    first_end,
+                    second_sequence,
+                    second_start,
+                    second_end,
+                );
+            } else {
+                res = self.dump("-", first_sequence, first_start, first_end);
+            }
+        } else if second_start < second_end {
+            res = self.dump("+", second_sequence, second_start, second_end);
+        }
+        res
+    }
+
+    fn qformat(
+        &self,
+        first_line: &str,
+        second_line: &str,
+        first_tags: &str,
+        second_tags: &str,
+    ) -> Vec<String> {
+        let mut res = Vec::new();
+        let mut first_tags = first_tags;
+        let mut second_tags = second_tags;
+        let mut common = cmp::min(
+            count_leading(first_line, '\t'),
+            count_leading(second_line, '\t'),
+        );
+        common = cmp::min(common, count_leading(first_tags.split_at(common).0, ' '));
+        common = cmp::min(common, count_leading(first_tags.split_at(common).0, ' '));
+        first_tags = first_tags.split_at(common).1.trim_right();
+        second_tags = second_tags.split_at(common).1.trim_right();
+        let mut s = format!("- {}", first_line);
+        res.push(s);
+        if first_tags != "" {
+            s = format!("? {}{}\n", str_with_similar_chars('\t', common), first_tags);
+            res.push(s);
+        }
+        s = format!("+ {}", second_line);
+        res.push(s);
+        if second_tags != "" {
+            s = format!(
+                "? {}{}\n",
+                str_with_similar_chars('\t', common),
+                second_tags
+            );
+            res.push(s);
+        }
+        res
+    }
+
+    pub fn restore(delta: &[String], which: usize) -> Vec<String> {
+        if !(which == 1 || which == 2) {
+            panic!("Second parameter must be 1 or 2");
+        }
+        let mut res = Vec::new();
+        let tag = if which == 1 { "- " } else { "+ " }.to_string();
+        let prefixes = vec![tag, "  ".to_string()];
+        for line in delta {
+            for prefix in &prefixes {
+                if line.starts_with(prefix) {
+                    res.push(line.split_at(2).1.to_string());
+                }
+            }
+        }
+        res
+    }
+}
+
+#[test]
+fn test_fancy_replace() {
+    let differ = Differ::new();
+    let result = differ
+        .fancy_replace(&vec!["abcDefghiJkl\n"], 0, 1, &vec!["abcdefGhijkl\n"], 0, 1)
+        .join("");
+    assert_eq!(
+        result,
+        "- abcDefghiJkl\n?    ^  ^  ^\n+ abcdefGhijkl\n?    ^  ^  ^\n"
+    );
+}
+
+#[test]
+fn test_qformat() {
+    let differ = Differ::new();
+    let result = differ.qformat(
+        "\tabcDefghiJkl\n",
+        "\tabcdefGhijkl\n",
+        "  ^ ^  ^      ",
+        "  ^ ^  ^      ",
+    );
+    assert_eq!(
+        result,
+        vec![
+            "- \tabcDefghiJkl\n",
+            "? \t ^ ^  ^\n",
+            "+ \tabcdefGhijkl\n",
+            "? \t ^ ^  ^\n",
+        ]
+    );
+}
diff --git a/src/lib.rs b/src/lib.rs

new file mode 100644 (file)

index 0000000..ca6b1cc
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,169 @@
+pub mod differ;
+pub mod sequencematcher;
+mod utils;
+
+use sequencematcher::{Sequence, SequenceMatcher};
+use std::collections::HashMap;
+use std::fmt::Display;
+use utils::{format_range_context, format_range_unified};
+
+pub fn get_close_matches<'a>(
+    word: &str,
+    possibilities: Vec<&'a str>,
+    n: usize,
+    cutoff: f32,
+) -> Vec<&'a str> {
+    if !(0.0 <= cutoff && cutoff <= 1.0) {
+        panic!("Cutoff must be greater than 0.0 and lower than 1.0");
+    }
+    let mut res: Vec<(f32, &str)> = Vec::new();
+    let mut matcher = SequenceMatcher::new("", word);
+    for i in &possibilities {
+        matcher.set_first_seq(i);
+        let ratio = matcher.ratio();
+        if ratio >= cutoff {
+            res.push((ratio, i));
+        }
+    }
+    res.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap());
+    res.truncate(n);
+    res.iter().map(|x| x.1).collect()
+}
+
+pub fn unified_diff<T: Sequence + Display>(
+    first_sequence: &[T],
+    second_sequence: &[T],
+    from_file: &str,
+    to_file: &str,
+    from_file_date: &str,
+    to_file_date: &str,
+    n: usize,
+) -> Vec<String> {
+    let mut res = Vec::new();
+    let lineterm = '\n';
+    let mut started = false;
+    let mut matcher = SequenceMatcher::new(first_sequence, second_sequence);
+    for group in &matcher.get_grouped_opcodes(n) {
+        if !started {
+            started = true;
+            let from_date = format!("\t{}", from_file_date);
+            let to_date = format!("\t{}", to_file_date);
+            res.push(format!("--- {}{}{}", from_file, from_date, lineterm));
+            res.push(format!("+++ {}{}{}", to_file, to_date, lineterm));
+        }
+        let (first, last) = (group.first().unwrap(), group.last().unwrap());
+        let file1_range = format_range_unified(first.first_start, last.first_end);
+        let file2_range = format_range_unified(first.second_start, last.second_end);
+        res.push(format!(
+            "@@ -{} +{} @@{}",
+            file1_range, file2_range, lineterm
+        ));
+        for code in group {
+            if code.tag == "equal" {
+                for item in first_sequence
+                    .iter()
+                    .take(code.first_end)
+                    .skip(code.first_start)
+                {
+                    res.push(format!(" {}", item));
+                }
+                continue;
+            }
+            if code.tag == "replace" || code.tag == "delete" {
+                for item in first_sequence
+                    .iter()
+                    .take(code.first_end)
+                    .skip(code.first_start)
+                {
+                    res.push(format!("-{}", item));
+                }
+            }
+            if code.tag == "replace" || code.tag == "insert" {
+                for item in second_sequence
+                    .iter()
+                    .take(code.second_end)
+                    .skip(code.second_start)
+                {
+                    res.push(format!("+{}", item));
+                }
+            }
+        }
+    }
+    res
+}
+
+pub fn context_diff<T: Sequence + Display>(
+    first_sequence: &[T],
+    second_sequence: &[T],
+    from_file: &str,
+    to_file: &str,
+    from_file_date: &str,
+    to_file_date: &str,
+    n: usize,
+) -> Vec<String> {
+    let mut res = Vec::new();
+    let lineterm = '\n';
+    let mut prefix: HashMap<String, String> = HashMap::new();
+    prefix.insert(String::from("insert"), String::from("+ "));
+    prefix.insert(String::from("delete"), String::from("- "));
+    prefix.insert(String::from("replace"), String::from("! "));
+    prefix.insert(String::from("equal"), String::from("  "));
+    let mut started = false;
+    let mut matcher = SequenceMatcher::new(first_sequence, second_sequence);
+    for group in &matcher.get_grouped_opcodes(n) {
+        if !started {
+            started = true;
+            let from_date = format!("\t{}", from_file_date);
+            let to_date = format!("\t{}", to_file_date);
+            res.push(format!("*** {}{}{}", from_file, from_date, lineterm));
+            res.push(format!("--- {}{}{}", to_file, to_date, lineterm));
+        }
+        let (first, last) = (group.first().unwrap(), group.last().unwrap());
+        res.push(format!("***************{}", lineterm));
+        let file1_range = format_range_context(first.first_start, last.first_end);
+        res.push(format!("*** {} ****{}", file1_range, lineterm));
+        let mut any = false;
+        for opcode in group {
+            if opcode.tag == "replace" || opcode.tag == "delete" {
+                any = true;
+                break;
+            }
+        }
+        if any {
+            for opcode in group {
+                if opcode.tag != "insert" {
+                    for item in first_sequence
+                        .iter()
+                        .take(opcode.first_end)
+                        .skip(opcode.first_start)
+                    {
+                        res.push(format!("{}{}", &prefix[&opcode.tag], item));
+                    }
+                }
+            }
+        }
+        let file2_range = format_range_context(first.second_start, last.second_end);
+        res.push(format!("--- {} ----{}", file2_range, lineterm));
+        any = false;
+        for opcode in group {
+            if opcode.tag == "replace" || opcode.tag == "insert" {
+                any = true;
+                break;
+            }
+        }
+        if any {
+            for opcode in group {
+                if opcode.tag != "delete" {
+                    for item in second_sequence
+                        .iter()
+                        .take(opcode.second_end)
+                        .skip(opcode.second_start)
+                    {
+                        res.push(format!("{}{}", prefix[&opcode.tag], item));
+                    }
+                }
+            }
+        }
+    }
+    res
+}
diff --git a/src/sequencematcher.rs b/src/sequencematcher.rs

new file mode 100644 (file)

index 0000000..4157808
--- /dev/null
+++ b/src/sequencematcher.rs
@@ -0,0 +1,346 @@
+use std::cmp::{max, min};
+use std::collections::HashMap;
+use std::hash::Hash;
+use utils::calculate_ratio;
+
+#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
+pub struct Match {
+    pub first_start: usize,
+    pub second_start: usize,
+    pub size: usize,
+}
+
+impl Match {
+    fn new(first_start: usize, second_start: usize, size: usize) -> Match {
+        Match {
+            first_start,
+            second_start,
+            size,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq)]
+pub struct Opcode {
+    pub tag: String,
+    pub first_start: usize,
+    pub first_end: usize,
+    pub second_start: usize,
+    pub second_end: usize,
+}
+
+impl Opcode {
+    fn new(
+        tag: String,
+        first_start: usize,
+        first_end: usize,
+        second_start: usize,
+        second_end: usize,
+    ) -> Opcode {
+        Opcode {
+            tag,
+            first_start,
+            first_end,
+            second_start,
+            second_end,
+        }
+    }
+}
+
+pub trait Sequence: Eq + Hash {}
+impl<T: Eq + Hash> Sequence for T {}
+
+pub struct SequenceMatcher<'a, T: 'a + Sequence> {
+    first_sequence: &'a [T],
+    second_sequence: &'a [T],
+    matching_blocks: Option<Vec<Match>>,
+    opcodes: Option<Vec<Opcode>>,
+    is_junk: Option<fn(&T) -> bool>,
+    second_sequence_elements: HashMap<&'a T, Vec<usize>>,
+}
+
+impl<'a, T: Sequence> SequenceMatcher<'a, T> {
+    pub fn new<S>(first_sequence: &'a S, second_sequence: &'a S) -> SequenceMatcher<'a, T>
+    where
+        S: AsRef<[T]> + ?Sized,
+    {
+        let mut matcher = SequenceMatcher {
+            first_sequence: first_sequence.as_ref(),
+            second_sequence: second_sequence.as_ref(),
+            matching_blocks: None,
+            opcodes: None,
+            is_junk: None,
+            second_sequence_elements: HashMap::new(),
+        };
+        matcher.set_seqs(first_sequence, second_sequence);
+        matcher
+    }
+
+    pub fn set_is_junk(&mut self, is_junk: Option<fn(&T) -> bool>) {
+        self.is_junk = is_junk;
+        self.matching_blocks = None;
+        self.opcodes = None;
+        self.chain_second_seq();
+    }
+
+    pub fn set_seqs<S>(&mut self, first_sequence: &'a S, second_sequence: &'a S)
+    where
+        S: AsRef<[T]> + ?Sized,
+    {
+        self.set_first_seq(first_sequence);
+        self.set_second_seq(second_sequence);
+    }
+
+    pub fn set_first_seq<S>(&mut self, sequence: &'a S)
+    where
+        S: AsRef<[T]> + ?Sized,
+    {
+        self.first_sequence = sequence.as_ref();
+        self.matching_blocks = None;
+        self.opcodes = None;
+    }
+
+    pub fn set_second_seq<S>(&mut self, sequence: &'a S)
+    where
+        S: AsRef<[T]> + ?Sized,
+    {
+        self.second_sequence = sequence.as_ref();
+        self.matching_blocks = None;
+        self.opcodes = None;
+        self.chain_second_seq();
+    }
+
+    fn chain_second_seq(&mut self) {
+        let second_sequence = self.second_sequence;
+        let mut second_sequence_elements = HashMap::new();
+        for (i, item) in second_sequence.iter().enumerate() {
+            let mut counter = second_sequence_elements
+                .entry(item)
+                .or_insert_with(Vec::new);
+            counter.push(i);
+        }
+        if let Some(junk_func) = self.is_junk {
+            second_sequence_elements = second_sequence_elements
+                .into_iter()
+                .filter(|&(element, _)| !junk_func(element))
+                .collect();
+        }
+        // Filter out popular elements
+        let len = second_sequence.len();
+        if len >= 200 {
+            let test_len = (len as f32 / 100.0).floor() as usize + 1;
+            second_sequence_elements = second_sequence_elements
+                .into_iter()
+                .filter(|&(_, ref indexes)| indexes.len() > test_len)
+                .collect();
+        }
+        self.second_sequence_elements = second_sequence_elements;
+    }
+
+    pub fn find_longest_match(
+        &self,
+        first_start: usize,
+        first_end: usize,
+        second_start: usize,
+        second_end: usize,
+    ) -> Match {
+        let first_sequence = &self.first_sequence;
+        let second_sequence = &self.second_sequence;
+        let second_sequence_elements = &self.second_sequence_elements;
+        let (mut best_i, mut best_j, mut best_size) = (first_start, second_start, 0);
+        let mut j2len: HashMap<usize, usize> = HashMap::new();
+        for (i, item) in first_sequence
+            .iter()
+            .enumerate()
+            .take(first_end)
+            .skip(first_start)
+        {
+            let mut new_j2len: HashMap<usize, usize> = HashMap::new();
+            if let Some(indexes) = second_sequence_elements.get(item) {
+                for j in indexes {
+                    let j = *j;
+                    if j < second_start {
+                        continue;
+                    };
+                    if j >= second_end {
+                        break;
+                    };
+                    let mut size = 0;
+                    if j > 0 {
+                        if let Some(k) = j2len.get(&(j - 1)) {
+                            size = *k;
+                        }
+                    }
+                    size += 1;
+                    new_j2len.insert(j, size);
+                    if size > best_size {
+                        best_i = i + 1 - size;
+                        best_j = j + 1 - size;
+                        best_size = size;
+                    }
+                }
+            }
+            j2len = new_j2len;
+        }
+        for _ in 0..2 {
+            while best_i > first_start
+                && best_j > second_start
+                && first_sequence.get(best_i - 1) == second_sequence.get(best_j - 1)
+            {
+                best_i -= 1;
+                best_j -= 1;
+                best_size += 1;
+            }
+            while best_i + best_size < first_end
+                && best_j + best_size < second_end
+                && first_sequence.get(best_i + best_size) == second_sequence.get(best_j + best_size)
+            {
+                best_size += 1;
+            }
+        }
+        Match::new(best_i, best_j, best_size)
+    }
+
+    pub fn get_matching_blocks(&mut self) -> Vec<Match> {
+        if self.matching_blocks.as_ref().is_some() {
+            return self.matching_blocks.as_ref().unwrap().clone();
+        }
+        let (first_length, second_length) = (self.first_sequence.len(), self.second_sequence.len());
+        let mut matches = Vec::new();
+        let mut queue = vec![(0, first_length, 0, second_length)];
+        while !queue.is_empty() {
+            let (first_start, first_end, second_start, second_end) = queue.pop().unwrap();
+            let m = self.find_longest_match(first_start, first_end, second_start, second_end);
+            match m.size {
+                0 => {}
+                _ => {
+                    if first_start < m.first_start && second_start < m.second_start {
+                        queue.push((first_start, m.first_start, second_start, m.second_start));
+                    }
+                    if m.first_start + m.size < first_end && m.second_start + m.size < second_end {
+                        queue.push((
+                            m.first_start + m.size,
+                            first_end,
+                            m.second_start + m.size,
+                            second_end,
+                        ));
+                    }
+                    matches.push(m);
+                }
+            }
+        }
+        matches.sort_by(|a, b| a.cmp(b));
+        let (mut first_start, mut second_start, mut size) = (0, 0, 0);
+        let mut non_adjacent = Vec::new();
+        for m in &matches {
+            if first_start + size == m.first_start && second_start + size == m.second_start {
+                size += m.size
+            } else {
+                if size != 0 {
+                    non_adjacent.push(Match::new(first_start, second_start, size));
+                }
+                first_start = m.first_start;
+                second_start = m.second_start;
+                size = m.size;
+            }
+        }
+        if size != 0 {
+            non_adjacent.push(Match::new(first_start, second_start, size));
+        }
+        non_adjacent.push(Match::new(first_length, second_length, 0));
+        self.matching_blocks = Some(non_adjacent);
+        self.matching_blocks.as_ref().unwrap().clone()
+    }
+
+    pub fn get_opcodes(&mut self) -> Vec<Opcode> {
+        if self.opcodes.as_ref().is_some() {
+            return self.opcodes.as_ref().unwrap().clone();
+        }
+        let mut opcodes = Vec::new();
+        let (mut i, mut j) = (0, 0);
+        for m in self.get_matching_blocks() {
+            let mut tag = String::new();
+            if i < m.first_start && j < m.second_start {
+                tag = String::from("replace");
+            } else if i < m.first_start {
+                tag = String::from("delete");
+            } else if j < m.second_start {
+                tag = String::from("insert");
+            }
+            if !tag.is_empty() {
+                opcodes.push(Opcode::new(tag, i, m.first_start, j, m.second_start));
+            }
+            i = m.first_start + m.size;
+            j = m.second_start + m.size;
+            if m.size != 0 {
+                opcodes.push(Opcode::new(
+                    String::from("equal"),
+                    m.first_start,
+                    i,
+                    m.second_start,
+                    j,
+                ));
+            }
+        }
+        self.opcodes = Some(opcodes);
+        self.opcodes.as_ref().unwrap().clone()
+    }
+
+    pub fn get_grouped_opcodes(&mut self, n: usize) -> Vec<Vec<Opcode>> {
+        let mut res = Vec::new();
+        let mut codes = self.get_opcodes();
+        if codes.is_empty() {
+            codes.push(Opcode::new("equal".to_string(), 0, 1, 0, 1));
+        }
+
+        if codes.first().unwrap().tag == "equal" {
+            let opcode = codes.first_mut().unwrap();
+            opcode.first_start = max(opcode.first_start, opcode.first_end.saturating_sub(n));
+            opcode.second_start = max(opcode.second_start, opcode.second_end.saturating_sub(n));
+        }
+        if codes.last().unwrap().tag == "equal" {
+            let opcode = codes.last_mut().unwrap();
+            opcode.first_end = min(opcode.first_start + n, opcode.first_end);
+            opcode.second_end = min(opcode.second_start + n, opcode.second_end);
+        }
+        let nn = n + n;
+        let mut group = Vec::new();
+        for code in &codes {
+            let (mut first_start, mut second_start) = (code.first_start, code.second_start);
+            if code.tag == "equal" && code.first_end - code.first_start > nn {
+                group.push(Opcode::new(
+                    code.tag.clone(),
+                    code.first_start,
+                    min(code.first_end, code.first_start + n),
+                    code.second_start,
+                    min(code.second_end, code.second_start + n),
+                ));
+                res.push(group.clone());
+                group.clear();
+                first_start = max(first_start, code.first_end.saturating_sub(n));
+                second_start = max(second_start, code.second_end.saturating_sub(n));
+            }
+            group.push(Opcode::new(
+                code.tag.clone(),
+                first_start,
+                code.first_end,
+                second_start,
+                code.second_end,
+            ));
+        }
+        if !(group.len() == 1 && group.first().unwrap().tag == "equal") || group.is_empty() {
+            res.push(group.clone());
+        }
+        res
+    }
+
+    pub fn ratio(&mut self) -> f32 {
+        let matches = self.get_matching_blocks()
+            .iter()
+            .fold(0, |res, &m| res + m.size);
+        calculate_ratio(
+            matches,
+            self.first_sequence.len() + self.second_sequence.len(),
+        )
+    }
+}
diff --git a/src/utils.rs b/src/utils.rs

new file mode 100644 (file)

index 0000000..b711bf8
--- /dev/null
+++ b/src/utils.rs
@@ -0,0 +1,47 @@
+pub fn calculate_ratio(matches: usize, length: usize) -> f32 {
+    if length != 0 {
+        return 2.0 * matches as f32 / length as f32;
+    }
+    1.0
+}
+
+pub fn str_with_similar_chars(c: char, length: usize) -> String {
+    let mut s = String::new();
+    for _ in 0..length {
+        s.push_str(&c.to_string());
+    }
+    s
+}
+
+pub fn count_leading(line: &str, c: char) -> usize {
+    let (mut i, n) = (0, line.len());
+    let line: Vec<char> = line.chars().collect();
+    while (i < n) && line[i] == c {
+        i += 1;
+    }
+    i
+}
+
+pub fn format_range_unified(start: usize, end: usize) -> String {
+    let mut beginning = start + 1;
+    let length = end - start;
+    if length == 1 {
+        return beginning.to_string();
+    }
+    if length == 0 {
+        beginning -= 1;
+    }
+    format!("{},{}", beginning, length)
+}
+
+pub fn format_range_context(start: usize, end: usize) -> String {
+    let mut beginning = start + 1;
+    let length = end - start;
+    if length == 0 {
+        beginning -= 1
+    }
+    if length <= 1 {
+        return beginning.to_string();
+    }
+    format!("{},{}", beginning, beginning + length - 1)
+}
diff --git a/tests/tests.rs b/tests/tests.rs

new file mode 100644 (file)

index 0000000..8f49de9
--- /dev/null
+++ b/tests/tests.rs
@@ -0,0 +1,194 @@
+extern crate difflib;
+
+use difflib::differ::Differ;
+use difflib::sequencematcher::{Match, Opcode, SequenceMatcher};
+
+#[test]
+fn test_longest_match() {
+    let matcher = SequenceMatcher::new(" abcd", "abcd abcd");
+    let m = matcher.find_longest_match(0, 5, 0, 9);
+    assert_eq!(m.first_start, 0);
+    assert_eq!(m.second_start, 4);
+    assert_eq!(m.size, 5);
+}
+
+#[test]
+fn test_all_matches() {
+    let mut matcher = SequenceMatcher::new("abxcd", "abcd");
+    let result = matcher.get_matching_blocks();
+    let mut expected_result = Vec::new();
+    expected_result.push(Match {
+        first_start: 0,
+        second_start: 0,
+        size: 2,
+    });
+    expected_result.push(Match {
+        first_start: 3,
+        second_start: 2,
+        size: 2,
+    });
+    expected_result.push(Match {
+        first_start: 5,
+        second_start: 4,
+        size: 0,
+    });
+    assert_eq!(result, expected_result);
+}
+
+#[test]
+fn test_get_opcodes() {
+    let mut matcher = SequenceMatcher::new("qabxcd", "abycdf");
+    let result = matcher.get_opcodes();
+    let mut expected_result = Vec::new();
+    expected_result.push(Opcode {
+        tag: "delete".to_string(),
+        first_start: 0,
+        first_end: 1,
+        second_start: 0,
+        second_end: 0,
+    });
+    expected_result.push(Opcode {
+        tag: "equal".to_string(),
+        first_start: 1,
+        first_end: 3,
+        second_start: 0,
+        second_end: 2,
+    });
+    expected_result.push(Opcode {
+        tag: "replace".to_string(),
+        first_start: 3,
+        first_end: 4,
+        second_start: 2,
+        second_end: 3,
+    });
+    expected_result.push(Opcode {
+        tag: "equal".to_string(),
+        first_start: 4,
+        first_end: 6,
+        second_start: 3,
+        second_end: 5,
+    });
+    expected_result.push(Opcode {
+        tag: "insert".to_string(),
+        first_start: 6,
+        first_end: 6,
+        second_start: 5,
+        second_end: 6,
+    });
+    assert_eq!(result, expected_result);
+}
+
+#[test]
+fn test_ratio() {
+    let mut matcher = SequenceMatcher::new("abcd", "bcde");
+    assert_eq!(matcher.ratio(), 0.75);
+}
+
+#[test]
+fn test_get_close_matches() {
+    let words = vec!["ape", "apple", "peach", "puppy"];
+    let result = difflib::get_close_matches("appel", words, 3, 0.6);
+    assert_eq!(result, vec!["apple", "ape"]);
+}
+
+#[test]
+fn test_differ_compare() {
+    let first_text = vec!["one\n", "two\n", "three\n"];
+    let second_text = vec!["ore\n", "tree\n", "emu\n"];
+    let differ = Differ::new();
+    let result = differ.compare(&first_text, &second_text).join("");
+    assert_eq!(
+        result,
+        "- one\n?  ^\n+ ore\n?  ^\n- two\n- three\n?  -\n+ tree\n+ emu\n"
+    );
+}
+
+fn is_junk_char(ch: &char) -> bool {
+    if *ch == ' ' || *ch == '\t' {
+        return true;
+    }
+    false
+}
+
+#[test]
+fn test_differ_compare_with_func() {
+    let first_text = vec!["one\n", "two\n", "three\n"];
+    let second_text = vec!["ore\n", "tree\n", "emu\n"];
+    let mut differ = Differ::new();
+    differ.char_junk = Some(is_junk_char);
+    let result = differ.compare(&first_text, &second_text).join("");
+    assert_eq!(
+        result,
+        "- one\n?  ^\n+ ore\n?  ^\n- two\n- three\n?  -\n+ tree\n+ emu\n"
+    );
+}
+
+#[test]
+fn test_differ_restore() {
+    let first_text = vec!["one\n", "  two\n", "three\n"];
+    let second_text = vec!["ore\n", "tree\n", "emu\n"];
+    let differ = Differ::new();
+    let diff = differ.compare(&first_text, &second_text);
+    assert_eq!(first_text, Differ::restore(&diff, 1));
+    assert_eq!(second_text, Differ::restore(&diff, 2));
+}
+
+#[test]
+fn test_unified_diff() {
+    let first_text = "one two three four".split(" ").collect::<Vec<&str>>();
+    let second_text = "zero one tree four".split(" ").collect::<Vec<&str>>();
+    let result = difflib::unified_diff(
+        &first_text,
+        &second_text,
+        "Original",
+        "Current",
+        "2005-01-26 23:30:50",
+        "2010-04-02 10:20:52",
+        3,
+    ).join("");
+    assert_eq!(
+        result,
+        "--- Original\t2005-01-26 23:30:50\n+++ Current\t2010-04-02 10:20:52\n@@ -1,4 \
+         +1,4 @@\n+zero one-two-three+tree four"
+    );
+}
+
+#[test]
+fn test_context_diff() {
+    let first_text = "one two three four".split(" ").collect::<Vec<&str>>();
+    let second_text = "zero one tree four".split(" ").collect::<Vec<&str>>();
+    let result = difflib::context_diff(
+        &first_text,
+        &second_text,
+        "Original",
+        "Current",
+        "2005-01-26 23:30:50",
+        "2010-04-02 10:20:52",
+        3,
+    ).join("");
+    assert_eq!(
+        result,
+        "*** Original\t2005-01-26 23:30:50\n--- Current\t2010-04-02 \
+         10:20:52\n***************\n*** 1,4 ****\n  one! two! three  four--- 1,4 ----\n+ \
+         zero  one! tree  four"
+    );
+}
+
+#[test]
+fn test_integer_slice() {
+    let s1 = vec![1, 2, 3, 4, 5];
+    let s2 = vec![5, 4, 3, 2, 1];
+    let result = SequenceMatcher::new(&s1, &s2).get_matching_blocks();
+    let mut expected_result = Vec::new();
+    expected_result.push(Match {
+        first_start: 0,
+        second_start: 4,
+        size: 1,
+    });
+    expected_result.push(Match {
+        first_start: 5,
+        second_start: 5,
+        size: 0,
+    });
+    assert_eq!(result, expected_result);
+}
author	DongHun Kwak <dh0128.kwak@samsung.com>
	Tue, 7 Nov 2023 07:24:54 +0000 (16:24 +0900)
committer	DongHun Kwak <dh0128.kwak@samsung.com>
	Tue, 7 Nov 2023 07:24:54 +0000 (16:24 +0900)
Cargo.toml	[new file with mode: 0644]	patch \| blob
Cargo.toml.orig	[new file with mode: 0644]	patch \| blob
examples/example.rs	[new file with mode: 0644]	patch \| blob
src/differ.rs	[new file with mode: 0644]	patch \| blob
src/lib.rs	[new file with mode: 0644]	patch \| blob
src/sequencematcher.rs	[new file with mode: 0644]	patch \| blob
src/utils.rs	[new file with mode: 0644]	patch \| blob
tests/tests.rs	[new file with mode: 0644]	patch \| blob