Import dissimilar 1.0.6 upstream upstream/1.0.6
authorWoohyun Jung <wh0705.jung@samsung.com>
Tue, 14 Mar 2023 09:28:29 +0000 (18:28 +0900)
committerWoohyun Jung <wh0705.jung@samsung.com>
Tue, 14 Mar 2023 09:28:29 +0000 (18:28 +0900)
18 files changed:
.cargo_vcs_info.json [new file with mode: 0644]
.clippy.toml [new file with mode: 0644]
.github/FUNDING.yml [new file with mode: 0644]
.github/workflows/ci.yml [new file with mode: 0644]
.gitignore [new file with mode: 0644]
Cargo.toml [new file with mode: 0644]
Cargo.toml.orig [new file with mode: 0644]
LICENSE-APACHE [new file with mode: 0644]
LICENSE-MIT [new file with mode: 0644]
README.md [new file with mode: 0644]
benches/bench.rs [new file with mode: 0644]
benches/document1.txt [new file with mode: 0644]
benches/document2.txt [new file with mode: 0644]
src/find.rs [new file with mode: 0644]
src/lib.rs [new file with mode: 0644]
src/range.rs [new file with mode: 0644]
src/tests.rs [new file with mode: 0644]
tests/test.rs [new file with mode: 0644]

diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
new file mode 100644 (file)
index 0000000..77399d2
--- /dev/null
@@ -0,0 +1,6 @@
+{
+  "git": {
+    "sha1": "00360c4b4b2735c5e1cf21e18af8aca3d28eb5ef"
+  },
+  "path_in_vcs": ""
+}
\ No newline at end of file
diff --git a/.clippy.toml b/.clippy.toml
new file mode 100644 (file)
index 0000000..0a54853
--- /dev/null
@@ -0,0 +1 @@
+msrv = "1.36.0"
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644 (file)
index 0000000..7507077
--- /dev/null
@@ -0,0 +1 @@
+github: dtolnay
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644 (file)
index 0000000..540c84a
--- /dev/null
@@ -0,0 +1,81 @@
+name: CI
+
+on:
+  push:
+  pull_request:
+  schedule: [cron: "40 1 * * *"]
+
+permissions:
+  contents: read
+
+env:
+  RUSTFLAGS: -Dwarnings
+
+jobs:
+  pre_ci:
+    uses: dtolnay/.github/.github/workflows/pre_ci.yml@master
+
+  test:
+    name: Rust ${{matrix.rust}}
+    needs: pre_ci
+    if: needs.pre_ci.outputs.continue
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        rust: [nightly, beta, stable]
+    timeout-minutes: 45
+    steps:
+      - uses: actions/checkout@v3
+      - uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{matrix.rust}}
+      - run: cargo test
+      - run: cargo test --benches --release
+        if: matrix.rust == 'nightly'
+
+  msrv:
+    name: Rust 1.36.0
+    needs: pre_ci
+    if: needs.pre_ci.outputs.continue
+    runs-on: ubuntu-latest
+    timeout-minutes: 45
+    steps:
+      - uses: actions/checkout@v3
+      - uses: dtolnay/rust-toolchain@1.36.0
+      - run: cargo check
+
+  fuzz:
+    name: Fuzz
+    needs: pre_ci
+    if: needs.pre_ci.outputs.continue
+    runs-on: ubuntu-latest
+    timeout-minutes: 45
+    steps:
+      - uses: actions/checkout@v3
+      - uses: dtolnay/rust-toolchain@nightly
+      - uses: dtolnay/install@cargo-fuzz
+      - run: cargo fuzz check
+
+  clippy:
+    name: Clippy
+    runs-on: ubuntu-latest
+    if: github.event_name != 'pull_request'
+    timeout-minutes: 45
+    steps:
+      - uses: actions/checkout@v3
+      - uses: dtolnay/rust-toolchain@clippy
+      - run: cargo clippy --tests --benches -- -Dclippy::all -Dclippy::pedantic
+
+  miri:
+    name: Miri
+    needs: pre_ci
+    if: needs.pre_ci.outputs.continue
+    runs-on: ubuntu-latest
+    timeout-minutes: 45
+    steps:
+      - uses: actions/checkout@v3
+      - uses: dtolnay/rust-toolchain@miri
+      - run: cargo miri test
+        env:
+          MIRIFLAGS: -Zmiri-strict-provenance
diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..96ef6c0
--- /dev/null
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644 (file)
index 0000000..3fdb995
--- /dev/null
@@ -0,0 +1,36 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2018"
+rust-version = "1.36"
+name = "dissimilar"
+version = "1.0.6"
+authors = ["David Tolnay <dtolnay@gmail.com>"]
+description = "Diff library with semantic cleanup, based on Google's diff-match-patch"
+documentation = "https://docs.rs/dissimilar"
+readme = "README.md"
+keywords = ["diff"]
+categories = [
+    "algorithms",
+    "text-processing",
+]
+license = "Apache-2.0"
+repository = "https://github.com/dtolnay/dissimilar"
+
+[package.metadata.docs.rs]
+targets = ["x86_64-unknown-linux-gnu"]
+
+[lib]
+doc-scrape-examples = false
+
+[dev-dependencies.once_cell]
+version = "1"
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
new file mode 100644 (file)
index 0000000..f69c79e
--- /dev/null
@@ -0,0 +1,21 @@
+[package]
+name = "dissimilar"
+version = "1.0.6"
+authors = ["David Tolnay <dtolnay@gmail.com>"]
+categories = ["algorithms", "text-processing"]
+description = "Diff library with semantic cleanup, based on Google's diff-match-patch"
+documentation = "https://docs.rs/dissimilar"
+edition = "2018"
+keywords = ["diff"]
+license = "Apache-2.0" # See the readme. The whole crate is Apache licensed. Some parts are additionally MIT licensed.
+repository = "https://github.com/dtolnay/dissimilar"
+rust-version = "1.36"
+
+[lib]
+doc-scrape-examples = false
+
+[dev-dependencies]
+once_cell = "1"
+
+[package.metadata.docs.rs]
+targets = ["x86_64-unknown-linux-gnu"]
diff --git a/LICENSE-APACHE b/LICENSE-APACHE
new file mode 100644 (file)
index 0000000..1b5ec8b
--- /dev/null
@@ -0,0 +1,176 @@
+                              Apache License
+                        Version 2.0, January 2004
+                     http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+   "License" shall mean the terms and conditions for use, reproduction,
+   and distribution as defined by Sections 1 through 9 of this document.
+
+   "Licensor" shall mean the copyright owner or entity authorized by
+   the copyright owner that is granting the License.
+
+   "Legal Entity" shall mean the union of the acting entity and all
+   other entities that control, are controlled by, or are under common
+   control with that entity. For the purposes of this definition,
+   "control" means (i) the power, direct or indirect, to cause the
+   direction or management of such entity, whether by contract or
+   otherwise, or (ii) ownership of fifty percent (50%) or more of the
+   outstanding shares, or (iii) beneficial ownership of such entity.
+
+   "You" (or "Your") shall mean an individual or Legal Entity
+   exercising permissions granted by this License.
+
+   "Source" form shall mean the preferred form for making modifications,
+   including but not limited to software source code, documentation
+   source, and configuration files.
+
+   "Object" form shall mean any form resulting from mechanical
+   transformation or translation of a Source form, including but
+   not limited to compiled object code, generated documentation,
+   and conversions to other media types.
+
+   "Work" shall mean the work of authorship, whether in Source or
+   Object form, made available under the License, as indicated by a
+   copyright notice that is included in or attached to the work
+   (an example is provided in the Appendix below).
+
+   "Derivative Works" shall mean any work, whether in Source or Object
+   form, that is based on (or derived from) the Work and for which the
+   editorial revisions, annotations, elaborations, or other modifications
+   represent, as a whole, an original work of authorship. For the purposes
+   of this License, Derivative Works shall not include works that remain
+   separable from, or merely link (or bind by name) to the interfaces of,
+   the Work and Derivative Works thereof.
+
+   "Contribution" shall mean any work of authorship, including
+   the original version of the Work and any modifications or additions
+   to that Work or Derivative Works thereof, that is intentionally
+   submitted to Licensor for inclusion in the Work by the copyright owner
+   or by an individual or Legal Entity authorized to submit on behalf of
+   the copyright owner. For the purposes of this definition, "submitted"
+   means any form of electronic, verbal, or written communication sent
+   to the Licensor or its representatives, including but not limited to
+   communication on electronic mailing lists, source code control systems,
+   and issue tracking systems that are managed by, or on behalf of, the
+   Licensor for the purpose of discussing and improving the Work, but
+   excluding communication that is conspicuously marked or otherwise
+   designated in writing by the copyright owner as "Not a Contribution."
+
+   "Contributor" shall mean Licensor and any individual or Legal Entity
+   on behalf of whom a Contribution has been received by Licensor and
+   subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   copyright license to reproduce, prepare Derivative Works of,
+   publicly display, publicly perform, sublicense, and distribute the
+   Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   (except as stated in this section) patent license to make, have made,
+   use, offer to sell, sell, import, and otherwise transfer the Work,
+   where such license applies only to those patent claims licensable
+   by such Contributor that are necessarily infringed by their
+   Contribution(s) alone or by combination of their Contribution(s)
+   with the Work to which such Contribution(s) was submitted. If You
+   institute patent litigation against any entity (including a
+   cross-claim or counterclaim in a lawsuit) alleging that the Work
+   or a Contribution incorporated within the Work constitutes direct
+   or contributory patent infringement, then any patent licenses
+   granted to You under this License for that Work shall terminate
+   as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+   Work or Derivative Works thereof in any medium, with or without
+   modifications, and in Source or Object form, provided that You
+   meet the following conditions:
+
+   (a) You must give any other recipients of the Work or
+       Derivative Works a copy of this License; and
+
+   (b) You must cause any modified files to carry prominent notices
+       stating that You changed the files; and
+
+   (c) You must retain, in the Source form of any Derivative Works
+       that You distribute, all copyright, patent, trademark, and
+       attribution notices from the Source form of the Work,
+       excluding those notices that do not pertain to any part of
+       the Derivative Works; and
+
+   (d) If the Work includes a "NOTICE" text file as part of its
+       distribution, then any Derivative Works that You distribute must
+       include a readable copy of the attribution notices contained
+       within such NOTICE file, excluding those notices that do not
+       pertain to any part of the Derivative Works, in at least one
+       of the following places: within a NOTICE text file distributed
+       as part of the Derivative Works; within the Source form or
+       documentation, if provided along with the Derivative Works; or,
+       within a display generated by the Derivative Works, if and
+       wherever such third-party notices normally appear. The contents
+       of the NOTICE file are for informational purposes only and
+       do not modify the License. You may add Your own attribution
+       notices within Derivative Works that You distribute, alongside
+       or as an addendum to the NOTICE text from the Work, provided
+       that such additional attribution notices cannot be construed
+       as modifying the License.
+
+   You may add Your own copyright statement to Your modifications and
+   may provide additional or different license terms and conditions
+   for use, reproduction, or distribution of Your modifications, or
+   for any such Derivative Works as a whole, provided Your use,
+   reproduction, and distribution of the Work otherwise complies with
+   the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+   any Contribution intentionally submitted for inclusion in the Work
+   by You to the Licensor shall be under the terms and conditions of
+   this License, without any additional terms or conditions.
+   Notwithstanding the above, nothing herein shall supersede or modify
+   the terms of any separate license agreement you may have executed
+   with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+   names, trademarks, service marks, or product names of the Licensor,
+   except as required for reasonable and customary use in describing the
+   origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+   agreed to in writing, Licensor provides the Work (and each
+   Contributor provides its Contributions) on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+   implied, including, without limitation, any warranties or conditions
+   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+   PARTICULAR PURPOSE. You are solely responsible for determining the
+   appropriateness of using or redistributing the Work and assume any
+   risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+   whether in tort (including negligence), contract, or otherwise,
+   unless required by applicable law (such as deliberate and grossly
+   negligent acts) or agreed to in writing, shall any Contributor be
+   liable to You for damages, including any direct, indirect, special,
+   incidental, or consequential damages of any character arising as a
+   result of this License or out of the use or inability to use the
+   Work (including but not limited to damages for loss of goodwill,
+   work stoppage, computer failure or malfunction, or any and all
+   other commercial damages or losses), even if such Contributor
+   has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+   the Work or Derivative Works thereof, You may choose to offer,
+   and charge a fee for, acceptance of support, warranty, indemnity,
+   or other liability obligations and/or rights consistent with this
+   License. However, in accepting such obligations, You may act only
+   on Your own behalf and on Your sole responsibility, not on behalf
+   of any other Contributor, and only if You agree to indemnify,
+   defend, and hold each Contributor harmless for any liability
+   incurred by, or claims asserted against, such Contributor by reason
+   of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644 (file)
index 0000000..31aa793
--- /dev/null
@@ -0,0 +1,23 @@
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644 (file)
index 0000000..82ce669
--- /dev/null
+++ b/README.md
@@ -0,0 +1,64 @@
+Dissimilar: diff library with semantic cleanup
+==============================================
+
+[<img alt="github" src="https://img.shields.io/badge/github-dtolnay/dissimilar-8da0cb?style=for-the-badge&labelColor=555555&logo=github" height="20">](https://github.com/dtolnay/dissimilar)
+[<img alt="crates.io" src="https://img.shields.io/crates/v/dissimilar.svg?style=for-the-badge&color=fc8d62&logo=rust" height="20">](https://crates.io/crates/dissimilar)
+[<img alt="docs.rs" src="https://img.shields.io/badge/docs.rs-dissimilar-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs" height="20">](https://docs.rs/dissimilar)
+[<img alt="build status" src="https://img.shields.io/github/actions/workflow/status/dtolnay/dissimilar/ci.yml?branch=master&style=for-the-badge" height="20">](https://github.com/dtolnay/dissimilar/actions?query=branch%3Amaster)
+
+This library is a port of the Diff component of [Diff Match Patch] to Rust. The
+diff implementation is based on [Myers' diff algorithm] but includes some
+[semantic cleanups] to increase human readability by factoring out commonalities
+which are likely to be coincidental.
+
+Diff Match Patch was originally built in 2006 to power Google Docs.
+
+[Diff Match Patch]: https://github.com/google/diff-match-patch
+[Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf
+[semantic cleanups]: https://neil.fraser.name/writing/diff/
+
+```toml
+[dependencies]
+dissimilar = "1.0"
+```
+
+*Compiler support: requires rustc 1.36+*
+
+<br>
+
+## Interface
+
+Here is the entire API of the Rust implementation. It operates on borrowed
+strings and the return value of the diff algorithm is a vector of chunks
+pointing into slices of those input strings.
+
+```rust
+pub enum Chunk<'a> {
+    Equal(&'a str),
+    Delete(&'a str),
+    Insert(&'a str),
+}
+
+pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>;
+```
+
+<br>
+
+## License
+
+The diff algorithm in this crate was ported to Rust using the Java and C++
+implementations found at <https://github.com/google/diff-match-patch> as
+reference, and is made available here under the <a href="LICENSE-APACHE">Apache
+License, Version 2.0</a> matching the license of the original. This entire
+project, including some parts unmodified from upstream and the Rust-specific
+modifications introduced in the course of porting the implementation, are
+distributed under this Apache license.
+
+Intellectual property that is unique to the Rust implementation is additionally
+made available to you dually under the <a href="LICENSE-MIT">MIT license</a>, if
+you prefer. This applies to all design choices and implementation choices not
+found in the upstream repo.
+
+Unless you explicitly state otherwise, any contribution intentionally submitted
+for inclusion in this crate by you, as defined in the Apache-2.0 license, shall
+be dual Apache and MIT licensed, without any additional terms or conditions.
diff --git a/benches/bench.rs b/benches/bench.rs
new file mode 100644 (file)
index 0000000..72f2633
--- /dev/null
@@ -0,0 +1,15 @@
+#![feature(test)]
+
+extern crate test;
+
+use dissimilar::diff;
+use std::{fs, io};
+use test::Bencher;
+
+#[bench]
+fn bench(b: &mut Bencher) -> io::Result<()> {
+    let document1 = fs::read_to_string("benches/document1.txt")?;
+    let document2 = fs::read_to_string("benches/document2.txt")?;
+    b.iter(|| diff(&document1, &document2));
+    Ok(())
+}
diff --git a/benches/document1.txt b/benches/document1.txt
new file mode 100644 (file)
index 0000000..54b438f
--- /dev/null
@@ -0,0 +1,230 @@
+This is a '''list of newspapers published by [[Journal Register Company]]'''.
+
+The company owns daily and weekly newspapers, other print media properties and newspaper-affiliated local Websites in the [[U.S.]] states of [[Connecticut]], [[Michigan]], [[New York]], [[Ohio]] and [[Pennsylvania]], organized in six geographic "clusters":<ref>[http://www.journalregister.com/newspapers.html Journal Register Company: Our Newspapers], accessed February 10, 2008.</ref>
+
+== Capital-Saratoga ==
+Three dailies, associated weeklies and [[pennysaver]]s in greater [[Albany, New York]]; also [http://www.capitalcentral.com capitalcentral.com] and [http://www.jobsinnewyork.com JobsInNewYork.com].
+
+* ''The Oneida Daily Dispatch'' {{WS|oneidadispatch.com}} of [[Oneida, New York]]
+* ''[[The Record (Troy)|The Record]]'' {{WS|troyrecord.com}} of [[Troy, New York]]
+* ''[[The Saratogian]]'' {{WS|saratogian.com}} of [[Saratoga Springs, New York]]
+* Weeklies:
+** ''Community News'' {{WS|cnweekly.com}} weekly of [[Clifton Park, New York]]
+** ''Rome Observer'' of [[Rome, New York]]
+** ''Life & Times of Utica'' of [[Utica, New York]]
+
+== Connecticut ==
+Five dailies, associated weeklies and [[pennysaver]]s in the state of [[Connecticut]]; also [http://www.ctcentral.com CTcentral.com], [http://www.ctcarsandtrucks.com CTCarsAndTrucks.com] and [http://www.jobsinct.com JobsInCT.com].
+
+* ''The Middletown Press'' {{WS|middletownpress.com}} of [[Middletown, Connecticut|Middletown]]
+* ''[[New Haven Register]]'' {{WS|newhavenregister.com}} of [[New Haven, Connecticut|New Haven]]
+* ''The Register Citizen'' {{WS|registercitizen.com}} of [[Torrington, Connecticut|Torrington]]
+
+* [[New Haven Register#Competitors|Elm City Newspapers]] {{WS|ctcentral.com}}
+** ''The Advertiser'' of [[East Haven, Connecticut|East Haven]]
+** ''Hamden Chronicle'' of [[Hamden, Connecticut|Hamden]]
+** ''Milford Weekly'' of [[Milford, Connecticut|Milford]]
+** ''The Orange Bulletin'' of [[Orange, Connecticut|Orange]]
+** ''The Post'' of [[North Haven, Connecticut|North Haven]]
+** ''Shelton Weekly'' of [[Shelton, Connecticut|Shelton]]
+** ''The Stratford Bard'' of [[Stratford, Connecticut|Stratford]]
+** ''Wallingford Voice'' of [[Wallingford, Connecticut|Wallingford]]
+** ''West Haven News'' of [[West Haven, Connecticut|West Haven]]
+* Housatonic Publications 
+** ''The New Milford Times'' {{WS|newmilfordtimes.com}} of [[New Milford, Connecticut|New Milford]]
+** ''The Brookfield Journal'' of [[Brookfield, Connecticut|Brookfield]]
+** ''The Kent Good Times Dispatch'' of [[Kent, Connecticut|Kent]]
+** ''The Bethel Beacon'' of [[Bethel, Connecticut|Bethel]]
+** ''The Litchfield Enquirer'' of [[Litchfield, Connecticut|Litchfield]]
+** ''Litchfield County Times'' of [[Litchfield, Connecticut|Litchfield]]
+* Imprint Newspapers {{WS|imprintnewspapers.com}}
+** ''West Hartford News'' of [[West Hartford, Connecticut|West Hartford]]
+** ''Windsor Journal'' of [[Windsor, Connecticut|Windsor]]
+** ''Windsor Locks Journal'' of [[Windsor Locks, Connecticut|Windsor Locks]]
+** ''Avon Post'' of [[Avon, Connecticut|Avon]]
+** ''Farmington Post'' of [[Farmington, Connecticut|Farmington]]
+** ''Simsbury Post'' of [[Simsbury, Connecticut|Simsbury]]
+** ''Tri-Town Post'' of [[Burlington, Connecticut|Burlington]], [[Canton, Connecticut|Canton]] and [[Harwinton, Connecticut|Harwinton]]
+* Minuteman Publications
+** ''[[Fairfield Minuteman]]'' of [[Fairfield, Connecticut|Fairfield]]
+** ''The Westport Minuteman'' {{WS|westportminuteman.com}} of [[Westport, Connecticut|Westport]]
+* Shoreline Newspapers weeklies:
+** ''Branford Review'' of [[Branford, Connecticut|Branford]]
+** ''Clinton Recorder'' of [[Clinton, Connecticut|Clinton]]
+** ''The Dolphin'' of [[Naval Submarine Base New London]] in [[New London, Connecticut|New London]]
+** ''Main Street News'' {{WS|ctmainstreetnews.com}} of [[Essex, Connecticut|Essex]]
+** ''Pictorial Gazette'' of [[Old Saybrook, Connecticut|Old Saybrook]]
+** ''Regional Express'' of [[Colchester, Connecticut|Colchester]]
+** ''Regional Standard'' of [[Colchester, Connecticut|Colchester]]
+** ''Shoreline Times'' {{WS|shorelinetimes.com}} of [[Guilford, Connecticut|Guilford]]
+** ''Shore View East'' of [[Madison, Connecticut|Madison]]
+** ''Shore View West'' of [[Guilford, Connecticut|Guilford]]
+* Other weeklies:
+** ''Registro'' {{WS|registroct.com}} of [[New Haven, Connecticut|New Haven]]
+** ''Thomaston Express'' {{WS|thomastownexpress.com}} of [[Thomaston, Connecticut|Thomaston]]
+** ''Foothills Traders'' {{WS|foothillstrader.com}} of Torrington, Bristol, Canton
+
+== Michigan ==
+Four dailies, associated weeklies and [[pennysaver]]s in the state of [[Michigan]]; also [http://www.micentralhomes.com MIcentralhomes.com] and [http://www.micentralautos.com MIcentralautos.com]
+* ''[[Oakland Press]]'' {{WS|theoaklandpress.com}} of [[Oakland, Michigan|Oakland]]
+* ''Daily Tribune'' {{WS|dailytribune.com}} of [[Royal Oak, Michigan|Royal Oak]]
+* ''Macomb Daily'' {{WS|macombdaily.com}} of [[Mt. Clemens, Michigan|Mt. Clemens]]
+* ''[[Morning Sun]]'' {{WS|themorningsun.com}} of  [[Mount Pleasant, Michigan|Mount Pleasant]]
+* Heritage Newspapers {{WS|heritage.com}}
+** ''Belleville View''
+** ''Ile Camera''
+** ''Monroe Guardian''
+** ''Ypsilanti Courier''
+** ''News-Herald''
+** ''Press & Guide''
+** ''Chelsea Standard & Dexter Leader''
+** ''Manchester Enterprise''
+** ''Milan News-Leader''
+** ''Saline Reporter''
+* Independent Newspapers {{WS|sourcenewspapers.com}}
+** ''Advisor''
+** ''Source''
+* Morning Star {{WS|morningstarpublishing.com}}
+** ''Alma Reminder''
+** ''Alpena Star''
+** ''Antrim County News''
+** ''Carson City Reminder''
+** ''The Leader & Kalkaskian''
+** ''Ogemaw/Oscoda County Star''
+** ''Petoskey/Charlevoix Star''
+** ''Presque Isle Star''
+** ''Preview Community Weekly''
+** ''Roscommon County Star''
+** ''St. Johns Reminder''
+** ''Straits Area Star''
+** ''The (Edmore) Advertiser'' 
+* Voice Newspapers {{WS|voicenews.com}}
+** ''Armada Times''
+** ''Bay Voice''
+** ''Blue Water Voice''
+** ''Downriver Voice''
+** ''Macomb Township Voice''
+** ''North Macomb Voice''
+** ''Weekend Voice''
+** ''Suburban Lifestyles'' {{WS|suburbanlifestyles.com}}
+
+== Mid-Hudson ==
+One daily, associated magazines in the [[Hudson River Valley]] of [[New York]]; also [http://www.midhudsoncentral.com MidHudsonCentral.com] and [http://www.jobsinnewyork.com JobsInNewYork.com].
+
+* ''[[Daily Freeman]]'' {{WS|dailyfreeman.com}} of [[Kingston, New York]]
+
+== Ohio ==
+Two dailies, associated magazines and three shared Websites, all in the state of [[Ohio]]: [http://www.allaroundcleveland.com AllAroundCleveland.com], [http://www.allaroundclevelandcars.com AllAroundClevelandCars.com] and [http://www.allaroundclevelandjobs.com AllAroundClevelandJobs.com].
+
+* ''[[The News-Herald (Ohio)|The News-Herald]]'' {{WS|news-herald.com}} of [[Willoughby, Ohio|Willoughby]]
+* ''[[The Morning Journal]]'' {{WS|morningjournal.com}} of [[Lorain, Ohio|Lorain]]
+
+== Philadelphia area ==
+Seven dailies and associated weeklies and magazines in [[Pennsylvania]] and [[New Jersey]], and associated Websites: [http://www.allaroundphilly.com AllAroundPhilly.com], [http://www.jobsinnj.com JobsInNJ.com], [http://www.jobsinpa.com JobsInPA.com], and [http://www.phillycarsearch.com PhillyCarSearch.com].
+
+* ''The Daily Local'' {{WS|dailylocal.com}} of [[West Chester, Pennsylvania|West Chester]]
+* ''[[Delaware County Daily and Sunday Times]] {{WS|delcotimes.com}} of Primos
+* ''[[The Mercury (Pennsylvania)|The Mercury]]'' {{WS|pottstownmercury.com}} of [[Pottstown, Pennsylvania|Pottstown]]
+* ''The Phoenix'' {{WS|phoenixvillenews.com}} of [[Phoenixville, Pennsylvania|Phoenixville]]
+* ''[[The Reporter (Lansdale)|The Reporter]]'' {{WS|thereporteronline.com}} of [[Lansdale, Pennsylvania|Lansdale]]
+* ''The Times Herald'' {{WS|timesherald.com}} of [[Norristown, Pennsylvania|Norristown]]
+* ''[[The Trentonian]]'' {{WS|trentonian.com}} of [[Trenton, New Jersey]]
+
+* Weeklies
+** ''El Latino Expreso'' of [[Trenton, New Jersey]]
+** ''La Voz'' of [[Norristown, Pennsylvania]]
+** ''The Village News'' of [[Downingtown, Pennsylvania]]
+** ''The Times Record'' of [[Kennett Square, Pennsylvania]]
+** ''The Tri-County Record'' {{WS|tricountyrecord.com}} of [[Morgantown, Pennsylvania]]
+** ''News of Delaware County'' {{WS|newsofdelawarecounty.com}}of [[Havertown, Pennsylvania]]
+** ''Main Line Times'' {{WS|mainlinetimes.com}}of [[Ardmore, Pennsylvania]]
+** ''Penny Pincher'' of [[Pottstown, Pennsylvania]]
+** ''Town Talk'' {{WS|towntalknews.com}} of [[Ridley, Pennsylvania]]
+* Chesapeake Publishing {{WS|pa8newsgroup.com}} 
+** ''Solanco Sun Ledger'' of [[Quarryville, Pennsylvania]]
+** ''Columbia Ledger'' of [[Columbia, Pennsylvania]]
+** ''Coatesville Ledger'' of [[Downingtown, Pennsylvania]]
+** ''Parkesburg Post Ledger'' of [[Quarryville, Pennsylvania]]
+** ''Downingtown Ledger'' of [[Downingtown, Pennsylvania]]
+** ''The Kennett Paper'' of [[Kennett Square, Pennsylvania]]
+** ''Avon Grove Sun'' of [[West Grove, Pennsylvania]]
+** ''Oxford Tribune'' of [[Oxford, Pennsylvania]]
+** ''Elizabethtown Chronicle'' of [[Elizabethtown, Pennsylvania]]
+** ''Donegal Ledger'' of [[Donegal, Pennsylvania]]
+** ''Chadds Ford Post'' of [[Chadds Ford, Pennsylvania]]
+** ''The Central Record'' of [[Medford, New Jersey]]
+** ''Maple Shade Progress'' of [[Maple Shade, New Jersey]]
+* Intercounty Newspapers {{WS|buckslocalnews.com}} 
+** ''The Review'' of Roxborough, Pennsylvania
+** ''The Recorder'' of [[Conshohocken, Pennsylvania]]
+** ''The Leader'' of [[Mount Airy, Pennsylvania|Mount Airy]] and West Oak Lake, Pennsylvania
+** ''The Pennington Post'' of [[Pennington, New Jersey]]
+** ''The Bristol Pilot'' of [[Bristol, Pennsylvania]]
+** ''Yardley News'' of [[Yardley, Pennsylvania]]
+** ''New Hope Gazette'' of [[New Hope, Pennsylvania]]
+** ''Doylestown Patriot'' of [[Doylestown, Pennsylvania]]
+** ''Newtown Advance'' of [[Newtown, Pennsylvania]]
+** ''The Plain Dealer'' of [[Williamstown, New Jersey]]
+** ''News Report'' of [[Sewell, New Jersey]]
+** ''Record Breeze'' of [[Berlin, New Jersey]]
+** ''Newsweekly'' of [[Moorestown, New Jersey]]
+** ''Haddon Herald'' of [[Haddonfield, New Jersey]]
+** ''New Egypt Press'' of [[New Egypt, New Jersey]]
+** ''Community News'' of [[Pemberton, New Jersey]]
+** ''Plymouth Meeting Journal'' of [[Plymouth Meeting, Pennsylvania]]
+** ''Lafayette Hill Journal'' of [[Lafayette Hill, Pennsylvania]]
+* Montgomery Newspapers {{WS|montgomerynews.com}} 
+** ''Ambler Gazette'' of [[Ambler, Pennsylvania]]
+** ''Central Bucks Life'' of [[Bucks County, Pennsylvania]]
+** ''The Colonial'' of [[Plymouth Meeting, Pennsylvania]]
+** ''Glenside News'' of [[Glenside, Pennsylvania]]
+** ''The Globe'' of [[Lower Moreland Township, Pennsylvania]]
+** ''Main Line Life'' of [[Ardmore, Pennsylvania]]
+** ''Montgomery Life'' of [[Fort Washington, Pennsylvania]]
+** ''North Penn Life'' of [[Lansdale, Pennsylvania]]
+** ''Perkasie News Herald'' of [[Perkasie, Pennsylvania]]
+** ''Public Spirit'' of [[Hatboro, Pennsylvania]]
+** ''Souderton Independent'' of [[Souderton, Pennsylvania]]
+** ''Springfield Sun'' of [[Springfield, Pennsylvania]]
+** ''Spring-Ford Reporter'' of [[Royersford, Pennsylvania]]
+** ''Times Chronicle'' of [[Jenkintown, Pennsylvania]]
+** ''Valley Item'' of [[Perkiomenville, Pennsylvania]]
+** ''Willow Grove Guide'' of [[Willow Grove, Pennsylvania]]
+* News Gleaner Publications (closed December 2008) {{WS|newsgleaner.com}} 
+** ''Life Newspapers'' of [[Philadelphia, Pennsylvania]]
+* Suburban Publications
+** ''The Suburban & Wayne Times'' {{WS|waynesuburban.com}} of [[Wayne, Pennsylvania]]
+** ''The Suburban Advertiser'' of [[Exton, Pennsylvania]]
+** ''The King of Prussia Courier'' of [[King of Prussia, Pennsylvania]]
+* Press Newspapers {{WS|countypressonline.com}} 
+** ''County Press'' of [[Newtown Square, Pennsylvania]]
+** ''Garnet Valley Press'' of [[Glen Mills, Pennsylvania]]
+** ''Haverford Press'' of [[Newtown Square, Pennsylvania]] (closed January 2009)
+** ''Hometown Press'' of [[Glen Mills, Pennsylvania]] (closed January 2009)
+** ''Media Press'' of [[Newtown Square, Pennsylvania]] (closed January 2009)
+** ''Springfield Press'' of [[Springfield, Pennsylvania]]
+* Berks-Mont Newspapers {{WS|berksmontnews.com}} 
+** ''The Boyertown Area Times'' of [[Boyertown, Pennsylvania]]
+** ''The Kutztown Area Patriot'' of [[Kutztown, Pennsylvania]]
+** ''The Hamburg Area Item'' of [[Hamburg, Pennsylvania]]
+** ''The Southern Berks News'' of [[Exeter Township, Berks County, Pennsylvania]]
+** ''The Free Press'' of [[Quakertown, Pennsylvania]]
+** ''The Saucon News'' of [[Quakertown, Pennsylvania]]
+** ''Westside Weekly'' of [[Reading, Pennsylvania]]
+
+* Magazines
+** ''Bucks Co. Town & Country Living''
+** ''Chester Co. Town & Country Living''
+** ''Montomgery Co. Town & Country Living''
+** ''Garden State Town & Country Living''
+** ''Montgomery Homes''
+** ''Philadelphia Golfer''
+** ''Parents Express''
+** ''Art Matters''
+
+{{JRC}}
+
+==References==
+<references />
+
+[[Category:Journal Register publications|*]]
diff --git a/benches/document2.txt b/benches/document2.txt
new file mode 100644 (file)
index 0000000..8f25a80
--- /dev/null
@@ -0,0 +1,188 @@
+This is a '''list of newspapers published by [[Journal Register Company]]'''.
+
+The company owns daily and weekly newspapers, other print media properties and newspaper-affiliated local Websites in the [[U.S.]] states of [[Connecticut]], [[Michigan]], [[New York]], [[Ohio]], [[Pennsylvania]] and [[New Jersey]], organized in six geographic "clusters":<ref>[http://www.journalregister.com/publications.html Journal Register Company: Our Publications], accessed April 21, 2010.</ref>
+
+== Capital-Saratoga ==
+Three dailies, associated weeklies and [[pennysaver]]s in greater [[Albany, New York]]; also [http://www.capitalcentral.com capitalcentral.com] and [http://www.jobsinnewyork.com JobsInNewYork.com].
+
+* ''The Oneida Daily Dispatch'' {{WS|oneidadispatch.com}} of [[Oneida, New York]]
+* ''[[The Record (Troy)|The Record]]'' {{WS|troyrecord.com}} of [[Troy, New York]]
+* ''[[The Saratogian]]'' {{WS|saratogian.com}} of [[Saratoga Springs, New York]]
+* Weeklies:
+** ''Community News'' {{WS|cnweekly.com}} weekly of [[Clifton Park, New York]]
+** ''Rome Observer'' {{WS|romeobserver.com}} of [[Rome, New York]]
+** ''WG Life '' {{WS|saratogian.com/wglife/}} of [[Wilton, New York]]
+** ''Ballston Spa Life '' {{WS|saratogian.com/bspalife}} of [[Ballston Spa, New York]]
+** ''Greenbush Life'' {{WS|troyrecord.com/greenbush}} of [[Troy, New York]]
+** ''Latham Life'' {{WS|troyrecord.com/latham}} of [[Latham, New York]]
+** ''River Life'' {{WS|troyrecord.com/river}} of [[Troy, New York]]
+
+== Connecticut ==
+Three dailies, associated weeklies and [[pennysaver]]s in the state of [[Connecticut]]; also [http://www.ctcentral.com CTcentral.com], [http://www.ctcarsandtrucks.com CTCarsAndTrucks.com] and [http://www.jobsinct.com JobsInCT.com].
+
+* ''The Middletown Press'' {{WS|middletownpress.com}} of [[Middletown, Connecticut|Middletown]]
+* ''[[New Haven Register]]'' {{WS|newhavenregister.com}} of [[New Haven, Connecticut|New Haven]]
+* ''The Register Citizen'' {{WS|registercitizen.com}} of [[Torrington, Connecticut|Torrington]]
+
+* Housatonic Publications 
+** ''The Housatonic Times'' {{WS|housatonictimes.com}} of [[New Milford, Connecticut|New Milford]]
+** ''Litchfield County Times'' {{WS|countytimes.com}} of [[Litchfield, Connecticut|Litchfield]]
+
+* Minuteman Publications
+** ''[[Fairfield Minuteman]]'' {{WS|fairfieldminuteman.com}}of [[Fairfield, Connecticut|Fairfield]]
+** ''The Westport Minuteman'' {{WS|westportminuteman.com}} of [[Westport, Connecticut|Westport]]
+
+* Shoreline Newspapers 
+** ''The Dolphin'' {{WS|dolphin-news.com}} of [[Naval Submarine Base New London]] in [[New London, Connecticut|New London]]
+** ''Shoreline Times'' {{WS|shorelinetimes.com}} of [[Guilford, Connecticut|Guilford]]
+
+* Foothills Media Group {{WS|foothillsmediagroup.com}}
+** ''Thomaston Express'' {{WS|thomastonexpress.com}} of [[Thomaston, Connecticut|Thomaston]]
+** ''Good News About Torrington'' {{WS|goodnewsabouttorrington.com}} of [[Torrington, Connecticut|Torrington]]
+** ''Granby News'' {{WS|foothillsmediagroup.com/granby}} of [[Granby, Connecticut|Granby]]
+** ''Canton News'' {{WS|foothillsmediagroup.com/canton}} of [[Canton, Connecticut|Canton]]
+** ''Avon News'' {{WS|foothillsmediagroup.com/avon}} of [[Avon, Connecticut|Avon]]
+** ''Simsbury News'' {{WS|foothillsmediagroup.com/simsbury}} of [[Simsbury, Connecticut|Simsbury]]
+** ''Litchfield News'' {{WS|foothillsmediagroup.com/litchfield}} of [[Litchfield, Connecticut|Litchfield]]
+** ''Foothills Trader'' {{WS|foothillstrader.com}} of Torrington, Bristol, Canton
+
+* Other weeklies
+** ''The Milford-Orange Bulletin'' {{WS|ctbulletin.com}} of [[Orange, Connecticut|Orange]]
+** ''The Post-Chronicle'' {{WS|ctpostchronicle.com}} of [[North Haven, Connecticut|North Haven]]
+** ''West Hartford News'' {{WS|westhartfordnews.com}} of [[West Hartford, Connecticut|West Hartford]]
+
+* Magazines
+** ''The Connecticut Bride'' {{WS|connecticutmag.com}}
+** ''Connecticut Magazine'' {{WS|theconnecticutbride.com}}
+** ''Passport Magazine'' {{WS|passport-mag.com}}
+
+== Michigan ==
+Four dailies, associated weeklies and [[pennysaver]]s in the state of [[Michigan]]; also [http://www.micentralhomes.com MIcentralhomes.com] and [http://www.micentralautos.com MIcentralautos.com]
+* ''[[Oakland Press]]'' {{WS|theoaklandpress.com}} of [[Oakland, Michigan|Oakland]]
+* ''Daily Tribune'' {{WS|dailytribune.com}} of [[Royal Oak, Michigan|Royal Oak]]
+* ''Macomb Daily'' {{WS|macombdaily.com}} of [[Mt. Clemens, Michigan|Mt. Clemens]]
+* ''[[Morning Sun]]'' {{WS|themorningsun.com}} of  [[Mount Pleasant, Michigan|Mount Pleasant]]
+
+* Heritage Newspapers {{WS|heritage.com}}
+** ''Belleville View'' {{WS|bellevilleview.com}}
+** ''Ile Camera'' {{WS|thenewsherald.com/ile_camera}}
+** ''Monroe Guardian''  {{WS|monreguardian.com}}
+** ''Ypsilanti Courier'' {{WS|ypsilanticourier.com}}
+** ''News-Herald'' {{WS|thenewsherald.com}}
+** ''Press & Guide'' {{WS|pressandguide.com}}
+** ''Chelsea Standard & Dexter Leader'' {{WS|chelseastandard.com}}
+** ''Manchester Enterprise'' {{WS|manchesterguardian.com}}
+** ''Milan News-Leader'' {{WS|milannews.com}}
+** ''Saline Reporter'' {{WS|salinereporter.com}}
+* Independent Newspapers 
+** ''Advisor'' {{WS|sourcenewspapers.com}}
+** ''Source'' {{WS|sourcenewspapers.com}}
+* Morning Star {{WS|morningstarpublishing.com}}
+** ''The Leader & Kalkaskian'' {{WS|leaderandkalkaskian.com}}
+** ''Grand Traverse Insider'' {{WS|grandtraverseinsider.com}}
+** ''Alma Reminder''
+** ''Alpena Star''
+** ''Ogemaw/Oscoda County Star''
+** ''Presque Isle Star''
+** ''St. Johns Reminder''
+
+* Voice Newspapers {{WS|voicenews.com}}
+** ''Armada Times''
+** ''Bay Voice''
+** ''Blue Water Voice''
+** ''Downriver Voice''
+** ''Macomb Township Voice''
+** ''North Macomb Voice''
+** ''Weekend Voice''
+
+== Mid-Hudson ==
+One daily, associated magazines in the [[Hudson River Valley]] of [[New York]]; also [http://www.midhudsoncentral.com MidHudsonCentral.com] and [http://www.jobsinnewyork.com JobsInNewYork.com].
+
+* ''[[Daily Freeman]]'' {{WS|dailyfreeman.com}} of [[Kingston, New York]]
+* ''Las Noticias'' {{WS|lasnoticiasny.com}} of [[Kingston, New York]]
+
+== Ohio ==
+Two dailies, associated magazines and three shared Websites, all in the state of [[Ohio]]: [http://www.allaroundcleveland.com AllAroundCleveland.com], [http://www.allaroundclevelandcars.com AllAroundClevelandCars.com] and [http://www.allaroundclevelandjobs.com AllAroundClevelandJobs.com].
+
+* ''[[The News-Herald (Ohio)|The News-Herald]]'' {{WS|news-herald.com}} of [[Willoughby, Ohio|Willoughby]]
+* ''[[The Morning Journal]]'' {{WS|morningjournal.com}} of [[Lorain, Ohio|Lorain]]
+* ''El Latino Expreso'' {{WS|lorainlatino.com}} of [[Lorain, Ohio|Lorain]]
+
+== Philadelphia area ==
+Seven dailies and associated weeklies and magazines in [[Pennsylvania]] and [[New Jersey]], and associated Websites: [http://www.allaroundphilly.com AllAroundPhilly.com], [http://www.jobsinnj.com JobsInNJ.com], [http://www.jobsinpa.com JobsInPA.com], and [http://www.phillycarsearch.com PhillyCarSearch.com].
+
+* ''[[The Daily Local News]]'' {{WS|dailylocal.com}} of [[West Chester, Pennsylvania|West Chester]]
+* ''[[Delaware County Daily and Sunday Times]] {{WS|delcotimes.com}} of Primos [[Upper Darby Township, Pennsylvania]]
+* ''[[The Mercury (Pennsylvania)|The Mercury]]'' {{WS|pottstownmercury.com}} of [[Pottstown, Pennsylvania|Pottstown]]
+* ''[[The Reporter (Lansdale)|The Reporter]]'' {{WS|thereporteronline.com}} of [[Lansdale, Pennsylvania|Lansdale]]
+* ''The Times Herald'' {{WS|timesherald.com}} of [[Norristown, Pennsylvania|Norristown]]
+* ''[[The Trentonian]]'' {{WS|trentonian.com}} of [[Trenton, New Jersey]]
+
+* Weeklies
+* ''The Phoenix'' {{WS|phoenixvillenews.com}} of [[Phoenixville, Pennsylvania]]
+** ''El Latino Expreso'' {{WS|njexpreso.com}} of [[Trenton, New Jersey]]
+** ''La Voz'' {{WS|lavozpa.com}} of [[Norristown, Pennsylvania]]
+** ''The Tri County Record'' {{WS|tricountyrecord.com}} of [[Morgantown, Pennsylvania]]
+** ''Penny Pincher'' {{WS|pennypincherpa.com}}of [[Pottstown, Pennsylvania]]
+
+* Chesapeake Publishing  {{WS|southernchestercountyweeklies.com}}
+** ''The Kennett Paper'' {{WS|kennettpaper.com}} of [[Kennett Square, Pennsylvania]]
+** ''Avon Grove Sun'' {{WS|avongrovesun.com}} of [[West Grove, Pennsylvania]]
+** ''The Central Record'' {{WS|medfordcentralrecord.com}} of [[Medford, New Jersey]]
+** ''Maple Shade Progress'' {{WS|mapleshadeprogress.com}} of [[Maple Shade, New Jersey]]
+
+* Intercounty Newspapers {{WS|buckslocalnews.com}} {{WS|southjerseylocalnews.com}} 
+** ''The Pennington Post'' {{WS|penningtonpost.com}} of [[Pennington, New Jersey]]
+** ''The Bristol Pilot'' {{WS|bristolpilot.com}} of [[Bristol, Pennsylvania]]
+** ''Yardley News'' {{WS|yardleynews.com}} of [[Yardley, Pennsylvania]]
+** ''Advance of Bucks County'' {{WS|advanceofbucks.com}} of [[Newtown, Pennsylvania]]
+** ''Record Breeze'' {{WS|recordbreeze.com}} of [[Berlin, New Jersey]]
+** ''Community News'' {{WS|sjcommunitynews.com}} of [[Pemberton, New Jersey]]
+
+* Montgomery Newspapers {{WS|montgomerynews.com}} 
+** ''Ambler Gazette'' {{WS|amblergazette.com}} of [[Ambler, Pennsylvania]]
+** ''The Colonial'' {{WS|colonialnews.com}} of [[Plymouth Meeting, Pennsylvania]]
+** ''Glenside News'' {{WS|glensidenews.com}} of [[Glenside, Pennsylvania]]
+** ''The Globe'' {{WS|globenewspaper.com}} of [[Lower Moreland Township, Pennsylvania]]
+** ''Montgomery Life'' {{WS|montgomerylife.com}} of [[Fort Washington, Pennsylvania]]
+** ''North Penn Life'' {{WS|northpennlife.com}} of [[Lansdale, Pennsylvania]]
+** ''Perkasie News Herald'' {{WS|perkasienewsherald.com}} of [[Perkasie, Pennsylvania]]
+** ''Public Spirit'' {{WS|thepublicspirit.com}} of [[Hatboro, Pennsylvania]]
+** ''Souderton Independent'' {{WS|soudertonindependent.com}} of [[Souderton, Pennsylvania]]
+** ''Springfield Sun'' {{WS|springfieldsun.com}} of [[Springfield, Pennsylvania]]
+** ''Spring-Ford Reporter'' {{WS|springfordreporter.com}} of [[Royersford, Pennsylvania]]
+** ''Times Chronicle'' {{WS|thetimeschronicle.com}} of [[Jenkintown, Pennsylvania]]
+** ''Valley Item'' {{WS|valleyitem.com}} of [[Perkiomenville, Pennsylvania]]
+** ''Willow Grove Guide'' {{WS|willowgroveguide.com}} of [[Willow Grove, Pennsylvania]]
+** ''The Review'' {{WS|roxreview.com}} of [[Roxborough, Philadelphia, Pennsylvania]]
+
+* Main Line Media News {{WS|mainlinemedianews.com}}
+** ''Main Line Times'' {{WS|mainlinetimes.com}} of [[Ardmore, Pennsylvania]]
+** ''Main Line Life'' {{WS|mainlinelife.com}} of [[Ardmore, Pennsylvania]]
+** ''The King of Prussia Courier'' {{WS|kingofprussiacourier.com}} of [[King of Prussia, Pennsylvania]]
+
+* Delaware County News Network {{WS|delconewsnetwork.com}} 
+** ''News of Delaware County'' {{WS|newsofdelawarecounty.com}} of [[Havertown, Pennsylvania]]
+** ''County Press'' {{WS|countypressonline.com}} of [[Newtown Square, Pennsylvania]]
+** ''Garnet Valley Press'' {{WS|countypressonline.com}} of [[Glen Mills, Pennsylvania]]
+** ''Springfield Press'' {{WS|countypressonline.com}} of [[Springfield, Pennsylvania]]
+** ''Town Talk'' {{WS|towntalknews.com}} of [[Ridley, Pennsylvania]]
+
+* Berks-Mont Newspapers {{WS|berksmontnews.com}} 
+** ''The Boyertown Area Times'' {{WS|berksmontnews.com/boyertown_area_times}} of [[Boyertown, Pennsylvania]]
+** ''The Kutztown Area Patriot'' {{WS|berksmontnews.com/kutztown_area_patriot}} of [[Kutztown, Pennsylvania]]
+** ''The Hamburg Area Item'' {{WS|berksmontnews.com/hamburg_area_item}} of [[Hamburg, Pennsylvania]]
+** ''The Southern Berks News'' {{WS|berksmontnews.com/southern_berks_news}} of [[Exeter Township, Berks County, Pennsylvania]]
+** ''Community Connection'' {{WS|berksmontnews.com/community_connection}} of [[Boyertown, Pennsylvania]]
+
+* Magazines
+** ''Bucks Co. Town & Country Living'' {{WS|buckscountymagazine.com}} 
+** ''Parents Express'' {{WS|parents-express.com}} 
+** ''Real Men, Rednecks'' {{WS|realmenredneck.com}} 
+
+{{JRC}}
+
+==References==
+<references />
+
+[[Category:Journal Register publications|*]]
diff --git a/src/find.rs b/src/find.rs
new file mode 100644 (file)
index 0000000..4af3b8b
--- /dev/null
@@ -0,0 +1,232 @@
+// The strstr implementation in this file is extracted from the Rust standard
+// library's str::find. The algorithm works for arbitrary &[T] haystack and
+// needle but is only exposed by the standard library on UTF-8 strings.
+//
+// https://github.com/rust-lang/rust/blob/1.40.0/src/libcore/str/pattern.rs
+//
+// ---
+//
+// This is the Two-Way search algorithm, which was introduced in the paper:
+// Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
+//
+// Here's some background information.
+//
+// A *word* is a string of symbols. The *length* of a word should be a familiar
+// notion, and here we denote it for any word x by |x|. (We also allow for the
+// possibility of the *empty word*, a word of length zero.)
+//
+// If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be
+// a *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] ==
+// x[i+p]. For example, both 1 and 2 are periods for the string "aa". As another
+// example, the only period of the string "abcd" is 4.
+//
+// We denote by period(x) the *smallest* period of x (provided that x is
+// non-empty). This is always well-defined since every non-empty word x has at
+// least one period, |x|. We sometimes call this *the period* of x.
+//
+// If u, v and x are words such that x = uv, where uv is the concatenation of u
+// and v, then we say that (u, v) is a *factorization* of x.
+//
+// Let (u, v) be a factorization for a word x. Then if w is a non-empty word
+// such that both of the following hold
+//
+//   - either w is a suffix of u or u is a suffix of w
+//   - either w is a prefix of v or v is a prefix of w
+//
+// then w is said to be a *repetition* for the factorization (u, v).
+//
+// Just to unpack this, there are four possibilities here. Let w = "abc". Then
+// we might have:
+//
+//   - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
+//   - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
+//   - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
+//   - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
+//
+// Note that the word vu is a repetition for any factorization (u,v) of x = uv,
+// so every factorization has at least one repetition.
+//
+// If x is a string and (u, v) is a factorization for x, then a *local period*
+// for (u, v) is an integer r such that there is some word w such that |w| = r
+// and w is a repetition for (u, v).
+//
+// We denote by local_period(u, v) the smallest local period of (u, v). We
+// sometimes call this *the local period* of (u, v). Provided that x = uv is
+// non-empty, this is well-defined (because each non-empty word has at least one
+// factorization, as noted above).
+//
+// It can be proven that the following is an equivalent definition of a local
+// period for a factorization (u, v): any positive integer r such that x[i] ==
+// x[i+r] for all i such that |u| - r <= i <= |u| - 1 and such that both x[i]
+// and x[i+r] are defined. (i.e., i > 0 and i + r < |x|).
+//
+// Using the above reformulation, it is easy to prove that
+//
+//     1 <= local_period(u, v) <= period(uv)
+//
+// A factorization (u, v) of x such that local_period(u,v) = period(x) is called
+// a *critical factorization*.
+//
+// The algorithm hinges on the following theorem, which is stated without proof:
+//
+// **Critical Factorization Theorem** Any word x has at least one critical
+// factorization (u, v) such that |u| < period(x).
+//
+// The purpose of maximal_suffix is to find such a critical factorization.
+//
+// If the period is short, compute another factorization x = u' v' to use for
+// reverse search, chosen instead so that |v'| < period(x).
+
+use std::cmp;
+use std::usize;
+
+/// Two-Way substring search: returns the index in `haystack` of the first
+/// occurrence of `needle`, or `None` if `needle` does not occur.
+///
+/// # Panics
+///
+/// Panics if `needle` is empty (asserted below).
+pub fn find(haystack: &[char], needle: &[char]) -> Option<usize> {
+    assert!(!needle.is_empty());
+
+    // crit_pos: critical factorization index
+    let (crit_pos_false, period_false) = maximal_suffix(needle, false);
+    let (crit_pos_true, period_true) = maximal_suffix(needle, true);
+    // Both lexical orders are computed; the one yielding the larger index
+    // gives a critical factorization (see the header comment of this file
+    // and of maximal_suffix).
+    let (crit_pos, mut period) = if crit_pos_false > crit_pos_true {
+        (crit_pos_false, period_false)
+    } else {
+        (crit_pos_true, period_true)
+    };
+
+    // Byteset is an extension (not part of the two way algorithm); it is a
+    // 64-bit "fingerprint" where each set bit j corresponds to a (byte & 63) ==
+    // j present in the needle.
+    let byteset;
+    // Index into needle before which we have already matched.
+    let mut memory;
+
+    // A particularly readable explanation of what's going on here can be found
+    // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
+    // see the code for "Algorithm CP" on p. 323.
+    //
+    // What's going on is we have some critical factorization (u, v) of the
+    // needle, and we want to determine whether u is a suffix of &v[..period].
+    // If it is, we use "Algorithm CP1". Otherwise we use "Algorithm CP2", which
+    // is optimized for when the period of the needle is large.
+    let long_period = needle[..crit_pos] != needle[period..period + crit_pos];
+    if long_period {
+        // Long period case -- we have an approximation to the actual period,
+        // and don't use memorization.
+        //
+        // Approximate the period by lower bound max(|u|, |v|) + 1.
+        period = cmp::max(crit_pos, needle.len() - crit_pos) + 1;
+        byteset = byteset_create(needle);
+        // Dummy value to signify that the period is long.
+        memory = usize::MAX;
+    } else {
+        // Short period case -- the period is exact.
+        byteset = byteset_create(&needle[..period]);
+        memory = 0;
+    }
+
+    // One of the main ideas of Two-Way is that we factorize the needle into two
+    // halves, (u, v), and begin trying to find v in the haystack by scanning
+    // left to right. If v matches, we try to match u by scanning right to left.
+    // How far we can jump when we encounter a mismatch is all based on the fact
+    // that (u, v) is a critical factorization for the needle.
+    let mut position = 0;
+    let needle_last = needle.len() - 1;
+    'search: loop {
+        // Check that we have room to search in. position + needle_last cannot
+        // overflow if we assume slices are bounded by isize's range.
+        // The `?` ends the search with None once the window no longer fits.
+        let tail_byte = *haystack.get(position + needle_last)?;
+
+        // Quickly skip by large portions unrelated to our substring.
+        // (A fingerprint miss on the window's last char proves no match here.)
+        if !byteset_contains(byteset, tail_byte) {
+            position += needle.len();
+            if !long_period {
+                memory = 0;
+            }
+            continue 'search;
+        }
+
+        // See if the right part of the needle matches.
+        let start = if long_period {
+            crit_pos
+        } else {
+            // `memory` chars are already known to match; skip past them.
+            cmp::max(crit_pos, memory)
+        };
+        for i in start..needle.len() {
+            if needle[i] != haystack[position + i] {
+                position += i - crit_pos + 1;
+                if !long_period {
+                    memory = 0;
+                }
+                continue 'search;
+            }
+        }
+
+        // See if the left part of the needle matches.
+        let start = if long_period { 0 } else { memory };
+        for i in (start..crit_pos).rev() {
+            if needle[i] != haystack[position + i] {
+                // Shift by the (possibly approximate) period.
+                position += period;
+                if !long_period {
+                    memory = needle.len() - period;
+                }
+                continue 'search;
+            }
+        }
+
+        // We have found a match!
+        return Some(position);
+    }
+}
+
+// Build the 64-bit fingerprint of `chars`: for each char, set bit
+// `(ch as u8) & 0x3f`. The `as u8` cast truncates chars above 0xFF to their
+// low byte, so distinct chars may share a bit -- that only causes false
+// positives, which the full comparison in `find` then rejects.
+fn byteset_create(chars: &[char]) -> u64 {
+    chars.iter().fold(0, |a, &ch| (1 << (ch as u8 & 0x3f)) | a)
+}
+
+// Test whether `ch`'s fingerprint bit (computed the same way as in
+// `byteset_create`) is present in `byteset`. A `false` result proves `ch` is
+// not in the fingerprinted set; `true` may be a false positive.
+fn byteset_contains(byteset: u64, ch: char) -> bool {
+    (byteset >> ((ch as u8 & 0x3f) as usize)) & 1 != 0
+}
+
+// Compute the maximal suffix of `arr`.
+//
+// The maximal suffix is a possible critical factorization (u, v) of `arr`.
+//
+// Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
+// period of v.
+//
+// `order_greater` determines if lexical order is `<` or `>`. Both
+// orders must be computed -- the ordering with the largest `i` gives
+// a critical factorization.
+//
+// For long period cases, the resulting period is not exact (it is too short).
+fn maximal_suffix(arr: &[char], order_greater: bool) -> (usize, usize) {
+    let mut left = 0; // Corresponds to i in the paper
+    let mut right = 1; // Corresponds to j in the paper
+    let mut offset = 0; // Corresponds to k in the paper, but starting at 0
+                        // to match 0-based indexing.
+    let mut period = 1; // Corresponds to p in the paper
+
+    // Compare the candidate suffix starting at `left` against the one
+    // starting at `right`, one element at a time, until `right + offset`
+    // runs off the end of `arr`.
+    while let Some(&a) = arr.get(right + offset) {
+        // `left` will be inbounds when `right` is.
+        let b = arr[left + offset];
+        if (a < b && !order_greater) || (a > b && order_greater) {
+            // Suffix is smaller, period is entire prefix so far.
+            right += offset + 1;
+            offset = 0;
+            period = right - left;
+        } else if a == b {
+            // Advance through repetition of the current period.
+            if offset + 1 == period {
+                right += offset + 1;
+                offset = 0;
+            } else {
+                offset += 1;
+            }
+        } else {
+            // Suffix is larger, start over from current location.
+            left = right;
+            right += 1;
+            offset = 0;
+            period = 1;
+        }
+    }
+    // `left` is the starting index of the maximal suffix; `period` is its
+    // period (approximate in the long-period case -- see comment above).
+    (left, period)
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644 (file)
index 0000000..b66434a
--- /dev/null
@@ -0,0 +1,935 @@
+//! [![github]](https://github.com/dtolnay/dissimilar)&ensp;[![crates-io]](https://crates.io/crates/dissimilar)&ensp;[![docs-rs]](https://docs.rs/dissimilar)
+//!
+//! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
+//! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
+//! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs
+//!
+//! <br>
+//!
+//! ## Diff library with semantic cleanup, based on Google's diff-match-patch
+//!
+//! This library is a port of the Diff component of [Diff Match Patch] to Rust.
+//! The diff implementation is based on [Myers' diff algorithm] but includes
+//! some [semantic cleanups] to increase human readability by factoring out
+//! commonalities which are likely to be coincidental.
+//!
+//! Diff Match Patch was originally built in 2006 to power Google Docs.
+//!
+//! # Interface
+//!
+//! Here is the entire API of the Rust implementation. It operates on borrowed
+//! strings and the return value of the diff algorithm is a vector of chunks
+//! pointing into slices of those input strings.
+//!
+//! ```
+//! pub enum Chunk<'a> {
+//!     Equal(&'a str),
+//!     Delete(&'a str),
+//!     Insert(&'a str),
+//! }
+//!
+//! # const IGNORE: &str = stringify! {
+//! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>;
+//! # };
+//! ```
+//!
+//! [Diff Match Patch]: https://github.com/google/diff-match-patch
+//! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf
+//! [semantic cleanups]: https://neil.fraser.name/writing/diff/
+
+#![doc(html_root_url = "https://docs.rs/dissimilar/1.0.6")]
+#![allow(
+    clippy::blocks_in_if_conditions,
+    clippy::bool_to_int_with_if,
+    clippy::cast_possible_wrap,
+    clippy::cast_sign_loss,
+    clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127
+    clippy::collapsible_else_if,
+    clippy::comparison_chain,
+    clippy::match_same_arms,
+    clippy::module_name_repetitions,
+    clippy::must_use_candidate,
+    clippy::new_without_default,
+    clippy::octal_escapes,
+    clippy::shadow_unrelated,
+    clippy::similar_names,
+    clippy::too_many_lines,
+    clippy::unseparated_literal_suffix,
+    unused_parens, // false positive on Some(&(mut diff)) pattern
+)]
+
+mod find;
+mod range;
+
+#[cfg(test)]
+mod tests;
+
+use crate::range::{slice, Range};
+use std::cmp;
+use std::collections::VecDeque;
+use std::fmt::{self, Debug, Display, Write};
+
+/// One span of a computed diff.
+///
+/// `Equal` text appears in both inputs, `Delete` text only in the first
+/// argument to [`diff`], and `Insert` text only in the second. The contained
+/// slices borrow from the strings passed to [`diff`].
+#[derive(Copy, Clone, PartialEq, Eq)]
+pub enum Chunk<'a> {
+    Equal(&'a str),
+    Delete(&'a str),
+    Insert(&'a str),
+}
+
+// Internal counterpart of `Chunk`, operating on char `Range`s instead of
+// string slices. Unlike `Chunk::Equal`, `Diff::Equal` carries a range into
+// *each* input, since the shared text generally sits at different offsets in
+// text1 and text2.
+#[derive(Copy, Clone)]
+enum Diff<'a, 'b> {
+    Equal(Range<'a>, Range<'b>),
+    Delete(Range<'a>),
+    Insert(Range<'b>),
+}
+
+impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> {
+    // The range of text this diff covers; for `Equal`, the range into the
+    // first input.
+    fn text(&self) -> Range<'tmp> {
+        match *self {
+            Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range,
+        }
+    }
+
+    // Extend the covered span by `increment` chars on the left.
+    fn grow_left(&mut self, increment: usize) {
+        self.for_each(|range| {
+            range.offset -= increment;
+            range.len += increment;
+        });
+    }
+
+    // Extend the covered span by `increment` chars on the right.
+    fn grow_right(&mut self, increment: usize) {
+        self.for_each(|range| range.len += increment);
+    }
+
+    // Slide the span `increment` chars left without changing its length.
+    fn shift_left(&mut self, increment: usize) {
+        self.for_each(|range| range.offset -= increment);
+    }
+
+    // Slide the span `increment` chars right without changing its length.
+    fn shift_right(&mut self, increment: usize) {
+        self.for_each(|range| range.offset += increment);
+    }
+
+    // Apply `f` to every range held by this diff (both ranges for `Equal`).
+    fn for_each(&mut self, f: impl Fn(&mut Range)) {
+        match self {
+            Diff::Equal(range1, range2) => {
+                f(range1);
+                f(range2);
+            }
+            Diff::Delete(range) => f(range),
+            Diff::Insert(range) => f(range),
+        }
+    }
+}
+
+/// Computes the difference between `text1` and `text2`, returned as a
+/// sequence of [`Chunk`]s whose string slices borrow from the inputs.
+pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> {
+    // The diff is computed over chars rather than bytes; byte positions in
+    // the original strings are reconstructed below.
+    let chars1: Vec<char> = text1.chars().collect();
+    let chars2: Vec<char> = text2.chars().collect();
+    let range1 = Range::new(&chars1, ..);
+    let range2 = Range::new(&chars2, ..);
+
+    let mut solution = main(range1, range2);
+    cleanup_char_boundary(&mut solution);
+    cleanup_semantic(&mut solution);
+    cleanup_merge(&mut solution);
+
+    // Translate each char-range diff back into byte slices of the inputs,
+    // advancing byte cursors pos1 (into text1) and pos2 (into text2). An
+    // Equal chunk consumes the same number of bytes from both inputs.
+    let mut chunks = Vec::new();
+    let mut pos1 = 0;
+    let mut pos2 = 0;
+    for diff in solution.diffs {
+        chunks.push(match diff {
+            Diff::Equal(range, _) => {
+                let len = range.len_bytes();
+                let chunk = Chunk::Equal(&text1[pos1..pos1 + len]);
+                pos1 += len;
+                pos2 += len;
+                chunk
+            }
+            Diff::Delete(range) => {
+                let len = range.len_bytes();
+                let chunk = Chunk::Delete(&text1[pos1..pos1 + len]);
+                pos1 += len;
+                chunk
+            }
+            Diff::Insert(range) => {
+                let len = range.len_bytes();
+                let chunk = Chunk::Insert(&text2[pos2..pos2 + len]);
+                pos2 += len;
+                chunk
+            }
+        });
+    }
+    chunks
+}
+
+// Result of `main`: the two complete input ranges plus the list of diffs
+// computed between them (the whole-input ranges are kept so later cleanup
+// passes can see the full texts, not just the trimmed middle).
+struct Solution<'a, 'b> {
+    text1: Range<'a>,
+    text2: Range<'b>,
+    diffs: Vec<Diff<'a, 'b>>,
+}
+
+// Core diff driver: peel off the common prefix and suffix as `Equal` diffs,
+// run `compute` on the remaining middle, then reattach the trimmed ends and
+// merge adjacent diffs.
+fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> {
+    // Remember the untrimmed inputs for the returned Solution.
+    let whole1 = text1;
+    let whole2 = text2;
+
+    // Trim off common prefix.
+    let common_prefix_len = common_prefix(text1, text2);
+    let common_prefix = Diff::Equal(
+        text1.substring(..common_prefix_len),
+        text2.substring(..common_prefix_len),
+    );
+    text1 = text1.substring(common_prefix_len..);
+    text2 = text2.substring(common_prefix_len..);
+
+    // Trim off common suffix.
+    let common_suffix_len = common_suffix(text1, text2);
+    let common_suffix = Diff::Equal(
+        text1.substring(text1.len - common_suffix_len..),
+        text2.substring(text2.len - common_suffix_len..),
+    );
+    text1 = text1.substring(..text1.len - common_suffix_len);
+    text2 = text2.substring(..text2.len - common_suffix_len);
+
+    // Compute the diff on the middle block.
+    let mut solution = Solution {
+        text1: whole1,
+        text2: whole2,
+        diffs: compute(text1, text2),
+    };
+
+    // Restore the prefix and suffix.
+    if common_prefix_len > 0 {
+        solution.diffs.insert(0, common_prefix);
+    }
+    if common_suffix_len > 0 {
+        solution.diffs.push(common_suffix);
+    }
+
+    cleanup_merge(&mut solution);
+
+    solution
+}
+
+// Find the differences between two texts. Assumes that the texts do not have
+// any common prefix or suffix.
+fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
+    // Trivial cases: one or both inputs empty.
+    match (text1.is_empty(), text2.is_empty()) {
+        (true, true) => return Vec::new(),
+        (true, false) => return vec![Diff::Insert(text2)],
+        (false, true) => return vec![Diff::Delete(text1)],
+        (false, false) => {}
+    }
+
+    // Check for entire shorter text inside the longer text.
+    if text1.len > text2.len {
+        if let Some(i) = text1.find(text2) {
+            // text2 occurs whole within text1: everything around the match
+            // is a deletion.
+            return vec![
+                Diff::Delete(text1.substring(..i)),
+                Diff::Equal(text1.substring(i..i + text2.len), text2),
+                Diff::Delete(text1.substring(i + text2.len..)),
+            ];
+        }
+    } else {
+        if let Some(i) = text2.find(text1) {
+            // text1 occurs whole within text2: everything around the match
+            // is an insertion.
+            return vec![
+                Diff::Insert(text2.substring(..i)),
+                Diff::Equal(text1, text2.substring(i..i + text1.len)),
+                Diff::Insert(text2.substring(i + text1.len..)),
+            ];
+        }
+    }
+
+    if text1.len == 1 || text2.len == 1 {
+        // Single character string.
+        // After the previous check, the character can't be an equality.
+        return vec![Diff::Delete(text1), Diff::Insert(text2)];
+    }
+
+    // General case: split on the middle snake (Myers' algorithm).
+    bisect(text1, text2)
+}
+
+// Find the 'middle snake' of a diff, split the problem in two and return the
+// recursively constructed diff.
+//
+// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
+//
+// A forward search and a reverse search are advanced in lockstep.
+// `v1[v_offset + k]` holds the furthest-reaching x coordinate of the forward
+// path on diagonal k, and `v2` the same for the reverse path; -1 marks a
+// diagonal that has not been reached yet.
+fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
+    let max_d = (text1.len + text2.len + 1) / 2;
+    let v_offset = max_d;
+    let v_len = 2 * max_d;
+    let mut v1 = vec![-1isize; v_len];
+    let mut v2 = vec![-1isize; v_len];
+    v1[v_offset + 1] = 0;
+    v2[v_offset + 1] = 0;
+    let delta = text1.len as isize - text2.len as isize;
+    // If the total number of characters is odd, then the front path will
+    // collide with the reverse path.
+    let front = delta % 2 != 0;
+    // Offsets for start and end of k loop.
+    // Prevents mapping of space beyond the grid.
+    let mut k1start = 0;
+    let mut k1end = 0;
+    let mut k2start = 0;
+    let mut k2end = 0;
+    for d in 0..max_d as isize {
+        // Walk the front path one step.
+        let mut k1 = -d + k1start;
+        while k1 <= d - k1end {
+            let k1_offset = (v_offset as isize + k1) as usize;
+            // Extend from whichever neighboring diagonal reaches further.
+            let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) {
+                v1[k1_offset + 1]
+            } else {
+                v1[k1_offset - 1] + 1
+            } as usize;
+            let mut y1 = (x1 as isize - k1) as usize;
+            // Follow the snake: slide along any run of matching characters.
+            if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) {
+                let advance = common_prefix(s1, s2);
+                x1 += advance;
+                y1 += advance;
+            }
+            v1[k1_offset] = x1 as isize;
+            if x1 > text1.len {
+                // Ran off the right of the graph.
+                k1end += 2;
+            } else if y1 > text2.len {
+                // Ran off the bottom of the graph.
+                k1start += 2;
+            } else if front {
+                // Check whether the forward path has met the reverse path.
+                let k2_offset = v_offset as isize + delta - k1;
+                if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 {
+                    // Mirror x2 onto top-left coordinate system.
+                    let x2 = text1.len as isize - v2[k2_offset as usize];
+                    if x1 as isize >= x2 {
+                        // Overlap detected.
+                        return bisect_split(text1, text2, x1, y1);
+                    }
+                }
+            }
+            k1 += 2;
+        }
+
+        // Walk the reverse path one step.
+        let mut k2 = -d + k2start;
+        while k2 <= d - k2end {
+            let k2_offset = (v_offset as isize + k2) as usize;
+            let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) {
+                v2[k2_offset + 1]
+            } else {
+                v2[k2_offset - 1] + 1
+            } as usize;
+            let mut y2 = (x2 as isize - k2) as usize;
+            // Follow the snake backwards from the ends of the texts.
+            if x2 < text1.len && y2 < text2.len {
+                let advance = common_suffix(
+                    text1.substring(..text1.len - x2),
+                    text2.substring(..text2.len - y2),
+                );
+                x2 += advance;
+                y2 += advance;
+            }
+            v2[k2_offset] = x2 as isize;
+            if x2 > text1.len {
+                // Ran off the left of the graph.
+                k2end += 2;
+            } else if y2 > text2.len {
+                // Ran off the top of the graph.
+                k2start += 2;
+            } else if !front {
+                // Check whether the reverse path has met the forward path.
+                let k1_offset = v_offset as isize + delta - k2;
+                if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 {
+                    let x1 = v1[k1_offset as usize] as usize;
+                    let y1 = v_offset + x1 - k1_offset as usize;
+                    // Mirror x2 onto top-left coordinate system.
+                    x2 = text1.len - x2;
+                    if x1 >= x2 {
+                        // Overlap detected.
+                        return bisect_split(text1, text2, x1, y1);
+                    }
+                }
+            }
+            k2 += 2;
+        }
+    }
+    // Number of diffs equals number of characters, no commonality at all.
+    vec![Diff::Delete(text1), Diff::Insert(text2)]
+}
+
+// Given the location of the 'middle snake', split the diff in two parts and
+// recurse.
+fn bisect_split<'a, 'b>(
+    text1: Range<'a>,
+    text2: Range<'b>,
+    x: usize,
+    y: usize,
+) -> Vec<Diff<'a, 'b>> {
+    let (text1a, text1b) = text1.split_at(x);
+    let (text2a, text2b) = text2.split_at(y);
+
+    // Compute both diffs serially.
+    let mut diffs = main(text1a, text2a).diffs;
+    diffs.extend(main(text1b, text2b).diffs);
+
+    diffs
+}
+
+// Determine the length of the common prefix of two strings.
+fn common_prefix(text1: Range, text2: Range) -> usize {
+    for (i, (b1, b2)) in text1.chars().zip(text2.chars()).enumerate() {
+        if b1 != b2 {
+            return i;
+        }
+    }
+    cmp::min(text1.len, text2.len)
+}
+
+// Determine the length of the common suffix of two strings.
+fn common_suffix(text1: Range, text2: Range) -> usize {
+    for (i, (b1, b2)) in text1.chars().rev().zip(text2.chars().rev()).enumerate() {
+        if b1 != b2 {
+            return i;
+        }
+    }
+    cmp::min(text1.len, text2.len)
+}
+
+// Determine if the suffix of one string is the prefix of another.
+//
+// Returns the number of characters common to the end of the first string and
+// the start of the second string.
+fn common_overlap(mut text1: Range, mut text2: Range) -> usize {
+    // Eliminate the null case.
+    if text1.is_empty() || text2.is_empty() {
+        return 0;
+    }
+    // Truncate the longer string.
+    if text1.len > text2.len {
+        text1 = text1.substring(text1.len - text2.len..);
+    } else if text1.len < text2.len {
+        text2 = text2.substring(..text1.len);
+    }
+    // Quick check for the worst case.
+    if slice(text1) == slice(text2) {
+        return text1.len;
+    }
+
+    // Start by looking for a single character match
+    // and increase length until no match is found.
+    // Performance analysis: https://neil.fraser.name/news/2010/11/04/
+    let mut best = 0;
+    let mut length = 1;
+    loop {
+        // Search for the current `length`-char suffix of text1 inside text2.
+        let pattern = text1.substring(text1.len - length..);
+        let found = match text2.find(pattern) {
+            Some(found) => found,
+            None => return best,
+        };
+        // Jump ahead: any overlap must be at least `length + found` long.
+        length += found;
+        if found == 0
+            || slice(text1.substring(text1.len - length..)) == slice(text2.substring(..length))
+        {
+            // Confirmed an overlap of `length` characters; try a longer one.
+            best = length;
+            length += 1;
+        }
+    }
+}
+
+// Adjust diff boundaries so that no range begins or ends in the middle of a
+// user-perceived character segment, dropping any range that becomes empty.
+//
+// NOTE(review): `is_segmentation_boundary` below is a stub that reports every
+// position as a boundary (see its FIXME), so the `boundary_down`/`boundary_up`
+// adjustments currently return 0 and this pass only deduplicates overlaps.
+fn cleanup_char_boundary(solution: &mut Solution) {
+    fn is_segmentation_boundary(doc: &[char], pos: usize) -> bool {
+        // FIXME: use unicode-segmentation crate?
+        let _ = doc;
+        let _ = pos;
+        true
+    }
+
+    // Distance to the nearest boundary at or below `pos`.
+    fn boundary_down(doc: &[char], pos: usize) -> usize {
+        let mut adjust = 0;
+        while !is_segmentation_boundary(doc, pos - adjust) {
+            adjust += 1;
+        }
+        adjust
+    }
+
+    // Distance to the nearest boundary at or above `pos`.
+    fn boundary_up(doc: &[char], pos: usize) -> usize {
+        let mut adjust = 0;
+        while !is_segmentation_boundary(doc, pos + adjust) {
+            adjust += 1;
+        }
+        adjust
+    }
+
+    // Trim the front of `range` so it does not re-cover characters already
+    // covered by the previous range of the same kind.
+    fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) {
+        let prev_end = prev.offset + prev.len;
+        if prev_end > range.offset {
+            let delta = cmp::min(prev_end - range.offset, range.len);
+            range.offset += delta;
+            range.len -= delta;
+        }
+    }
+
+    // Compact the diff list in place: adjust each diff and keep it only if it
+    // still covers at least one character.
+    let mut read = 0;
+    let mut retain = 0;
+    let mut last_delete = Range::empty();
+    let mut last_insert = Range::empty();
+    while let Some(&(mut diff)) = solution.diffs.get(read) {
+        read += 1;
+        match &mut diff {
+            Diff::Equal(range1, range2) => {
+                // Shrink equalities inward to the nearest boundaries.
+                let adjust = boundary_up(range1.doc, range1.offset);
+                // If the whole range is sub-character, skip it.
+                if range1.len <= adjust {
+                    continue;
+                }
+                range1.offset += adjust;
+                range1.len -= adjust;
+                range2.offset += adjust;
+                range2.len -= adjust;
+                let adjust = boundary_down(range1.doc, range1.offset + range1.len);
+                range1.len -= adjust;
+                range2.len -= adjust;
+                last_delete = Range::empty();
+                last_insert = Range::empty();
+            }
+            Diff::Delete(range) => {
+                skip_overlap(&last_delete, range);
+                if range.len == 0 {
+                    continue;
+                }
+                // Grow deletions outward to the nearest boundaries.
+                let adjust = boundary_down(range.doc, range.offset);
+                range.offset -= adjust;
+                range.len += adjust;
+                let adjust = boundary_up(range.doc, range.offset + range.len);
+                range.len += adjust;
+                last_delete = *range;
+            }
+            Diff::Insert(range) => {
+                skip_overlap(&last_insert, range);
+                if range.len == 0 {
+                    continue;
+                }
+                // Grow insertions outward to the nearest boundaries.
+                let adjust = boundary_down(range.doc, range.offset);
+                range.offset -= adjust;
+                range.len += adjust;
+                let adjust = boundary_up(range.doc, range.offset + range.len);
+                range.len += adjust;
+                last_insert = *range;
+            }
+        }
+        solution.diffs[retain] = diff;
+        retain += 1;
+    }
+
+    solution.diffs.truncate(retain);
+}
+
+// Reduce the number of edits by eliminating semantically trivial equalities.
+//
+// Pass 1: delete any equality that is no larger than the edits on both sides
+// of it, replacing it with a Delete + Insert pair. Pass 2 (after the diff has
+// been re-normalized): where a deletion and the following insertion share a
+// boundary overlap, extract that overlap into an explicit equality.
+fn cleanup_semantic(solution: &mut Solution) {
+    let mut diffs = &mut solution.diffs;
+    if diffs.is_empty() {
+        return;
+    }
+
+    let mut changes = false;
+    let mut equalities = VecDeque::new(); // Double-ended queue of equalities.
+    let mut last_equality = None; // Always equal to equalities.peek().text
+    let mut pointer = 0;
+    // Number of characters that changed prior to the equality.
+    let mut len_insertions1 = 0;
+    let mut len_deletions1 = 0;
+    // Number of characters that changed after the equality.
+    let mut len_insertions2 = 0;
+    let mut len_deletions2 = 0;
+    while let Some(&this_diff) = diffs.get(pointer) {
+        match this_diff {
+            Diff::Equal(text1, text2) => {
+                // Reaching an equality rolls the "after" counters into the
+                // "before" counters and restarts the "after" counts.
+                equalities.push_back(pointer);
+                len_insertions1 = len_insertions2;
+                len_deletions1 = len_deletions2;
+                len_insertions2 = 0;
+                len_deletions2 = 0;
+                last_equality = Some((text1, text2));
+                pointer += 1;
+                continue;
+            }
+            Diff::Delete(text) => len_deletions2 += text.len,
+            Diff::Insert(text) => len_insertions2 += text.len,
+        }
+        // Eliminate an equality that is smaller or equal to the edits on both
+        // sides of it.
+        if last_equality.map_or(false, |(last_equality, _)| {
+            last_equality.len <= cmp::max(len_insertions1, len_deletions1)
+                && last_equality.len <= cmp::max(len_insertions2, len_deletions2)
+        }) {
+            // Jump back to offending equality.
+            pointer = equalities.pop_back().unwrap();
+
+            // Replace equality with a delete.
+            diffs[pointer] = Diff::Delete(last_equality.unwrap().0);
+            // Insert a corresponding insert.
+            diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1));
+
+            len_insertions1 = 0; // Reset the counters.
+            len_insertions2 = 0;
+            len_deletions1 = 0;
+            len_deletions2 = 0;
+            last_equality = None;
+            changes = true;
+
+            // Throw away the previous equality (it needs to be reevaluated).
+            equalities.pop_back();
+            if let Some(back) = equalities.back() {
+                // There is a safe equality we can fall back to.
+                pointer = *back;
+            } else {
+                // There are no previous equalities, jump back to the start.
+                pointer = 0;
+                continue;
+            }
+        }
+        pointer += 1;
+    }
+
+    // Normalize the diff.
+    if changes {
+        cleanup_merge(solution);
+    }
+    cleanup_semantic_lossless(solution);
+    diffs = &mut solution.diffs;
+
+    // Find any overlaps between deletions and insertions.
+    // e.g: <del>abcxxx</del><ins>xxxdef</ins>
+    //   -> <del>abc</del>xxx<ins>def</ins>
+    // e.g: <del>xxxabc</del><ins>defxxx</ins>
+    //   -> <ins>def</ins>xxx<del>abc</del>
+    // Only extract an overlap if it is as big as the edit ahead or behind it.
+    let mut pointer = 1;
+    while let Some(&this_diff) = diffs.get(pointer) {
+        let prev_diff = diffs[pointer - 1];
+        if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) {
+            let overlap_len1 = common_overlap(deletion, insertion);
+            let overlap_len2 = common_overlap(insertion, deletion);
+            let overlap_min = cmp::min(deletion.len, insertion.len);
+            if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min {
+                // Overlap found. Insert an equality and trim the surrounding edits.
+                diffs.insert(
+                    pointer,
+                    Diff::Equal(
+                        deletion.substring(deletion.len - overlap_len1..deletion.len),
+                        insertion.substring(..overlap_len1),
+                    ),
+                );
+                diffs[pointer - 1] =
+                    Diff::Delete(deletion.substring(..deletion.len - overlap_len1));
+                diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..));
+            } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min {
+                // Reverse overlap found.
+                // Insert an equality and swap and trim the surrounding edits.
+                diffs.insert(
+                    pointer,
+                    Diff::Equal(
+                        deletion.substring(..overlap_len2),
+                        insertion.substring(insertion.len - overlap_len2..insertion.len),
+                    ),
+                );
+                diffs[pointer - 1] =
+                    Diff::Insert(insertion.substring(..insertion.len - overlap_len2));
+                diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..));
+            }
+            // Extra advance: step past the insertion of the examined pair.
+            pointer += 1;
+        }
+        pointer += 1;
+    }
+}
+
+// Look for single edits surrounded on both sides by equalities which can be
+// shifted sideways to align the edit to a word boundary.
+//
+// e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
+fn cleanup_semantic_lossless(solution: &mut Solution) {
+    let diffs = &mut solution.diffs;
+    let mut pointer = 1;
+    while let Some(&next_diff) = diffs.get(pointer + 1) {
+        let prev_diff = diffs[pointer - 1];
+        if let (
+            Diff::Equal(mut prev_equal1, mut prev_equal2),
+            Diff::Equal(mut next_equal1, mut next_equal2),
+        ) = (prev_diff, next_diff)
+        {
+            // This is a single edit surrounded by equalities.
+            let mut edit = diffs[pointer];
+
+            // First, shift the edit as far left as possible.
+            let common_offset = common_suffix(prev_equal1, edit.text());
+            let original_prev_len = prev_equal1.len;
+            prev_equal1.len -= common_offset;
+            prev_equal2.len -= common_offset;
+            edit.shift_left(common_offset);
+            next_equal1.offset -= common_offset;
+            next_equal1.len += common_offset;
+            next_equal2.offset -= common_offset;
+            next_equal2.len += common_offset;
+
+            // Second, step character by character right, looking for the best fit.
+            let mut best_prev_equal = (prev_equal1, prev_equal2);
+            let mut best_edit = edit;
+            let mut best_next_equal = (next_equal1, next_equal2);
+            let mut best_score = cleanup_semantic_score(prev_equal1, edit.text())
+                + cleanup_semantic_score(edit.text(), next_equal1);
+            // Shifting right is possible while the edit's first char matches
+            // the next equality's first char.
+            while !edit.text().is_empty()
+                && !next_equal1.is_empty()
+                && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap()
+            {
+                // Move one char from the front of the edit onto the previous
+                // equality, and pull one char off the next equality.
+                prev_equal1.len += 1;
+                prev_equal2.len += 1;
+                edit.shift_right(1);
+                next_equal1.offset += 1;
+                next_equal1.len -= 1;
+                next_equal2.offset += 1;
+                next_equal2.len -= 1;
+                let score = cleanup_semantic_score(prev_equal1, edit.text())
+                    + cleanup_semantic_score(edit.text(), next_equal1);
+                // The >= encourages trailing rather than leading whitespace on edits.
+                if score >= best_score {
+                    best_score = score;
+                    best_prev_equal = (prev_equal1, prev_equal2);
+                    best_edit = edit;
+                    best_next_equal = (next_equal1, next_equal2);
+                }
+            }
+
+            if original_prev_len != best_prev_equal.0.len {
+                // We have an improvement, save it back to the diff.
+                if best_next_equal.0.is_empty() {
+                    diffs.remove(pointer + 1);
+                } else {
+                    diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1);
+                }
+                diffs[pointer] = best_edit;
+                if best_prev_equal.0.is_empty() {
+                    diffs.remove(pointer - 1);
+                    pointer -= 1;
+                } else {
+                    diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1);
+                }
+            }
+        }
+        pointer += 1;
+    }
+}
+
+// Given two strings, compute a score representing whether the internal boundary
+// falls on logical boundaries.
+//
+// Scores range from 6 (best) to 0 (worst).
+fn cleanup_semantic_score(one: Range, two: Range) -> usize {
+    if one.is_empty() || two.is_empty() {
+        // Edges are the best.
+        return 6;
+    }
+
+    // Each port of this function behaves slightly differently due to subtle
+    // differences in each language's definition of things like 'whitespace'.
+    // Since this function's purpose is largely cosmetic, the choice has been
+    // made to use each language's native features rather than force total
+    // conformity.
+    let char1 = one.chars().next_back().unwrap();
+    let char2 = two.chars().next().unwrap();
+    let non_alphanumeric1 = !char1.is_ascii_alphanumeric();
+    let non_alphanumeric2 = !char2.is_ascii_alphanumeric();
+    let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace();
+    let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace();
+    let line_break1 = whitespace1 && char1.is_control();
+    let line_break2 = whitespace2 && char2.is_control();
+    let blank_line1 =
+        line_break1 && (one.ends_with(['\n', '\n']) || one.ends_with(['\n', '\r', '\n']));
+    let blank_line2 =
+        line_break2 && (two.starts_with(['\n', '\n']) || two.starts_with(['\r', '\n', '\r', '\n']));
+
+    if blank_line1 || blank_line2 {
+        // Five points for blank lines.
+        5
+    } else if line_break1 || line_break2 {
+        // Four points for line breaks.
+        4
+    } else if non_alphanumeric1 && !whitespace1 && whitespace2 {
+        // Three points for end of sentences.
+        3
+    } else if whitespace1 || whitespace2 {
+        // Two points for whitespace.
+        2
+    } else if non_alphanumeric1 || non_alphanumeric2 {
+        // One point for non-alphanumeric.
+        1
+    } else {
+        0
+    }
+}
+
+// Reorder and merge like edit sections. Merge equalities. Any edit section can
+// move as long as it doesn't cross an equality.
+//
+// Repeats two passes until stable: (1) coalesce adjacent runs of deletes and
+// inserts, factoring common prefixes/suffixes into neighboring equalities;
+// (2) slide single edits across an adjacent equality when that eliminates it.
+fn cleanup_merge(solution: &mut Solution) {
+    let diffs = &mut solution.diffs;
+    while !diffs.is_empty() {
+        diffs.push(Diff::Equal(
+            solution.text1.substring(solution.text1.len..),
+            solution.text2.substring(solution.text2.len..),
+        )); // Add a dummy entry at the end.
+        let mut pointer = 0;
+        let mut count_delete = 0;
+        let mut count_insert = 0;
+        let mut text_delete = Range::empty();
+        let mut text_insert = Range::empty();
+        while let Some(&this_diff) = diffs.get(pointer) {
+            match this_diff {
+                Diff::Insert(text) => {
+                    // Accumulate consecutive insertions into one range.
+                    count_insert += 1;
+                    if text_insert.is_empty() {
+                        text_insert = text;
+                    } else {
+                        text_insert.len += text.len;
+                    }
+                }
+                Diff::Delete(text) => {
+                    // Accumulate consecutive deletions into one range.
+                    count_delete += 1;
+                    if text_delete.is_empty() {
+                        text_delete = text;
+                    } else {
+                        text_delete.len += text.len;
+                    }
+                }
+                Diff::Equal(text, _) => {
+                    // An equality ends the current run of edits; flush it.
+                    let count_both = count_delete + count_insert;
+                    if count_both > 1 {
+                        let both_types = count_delete != 0 && count_insert != 0;
+                        // Delete the offending records.
+                        diffs.splice(pointer - count_both..pointer, None);
+                        pointer -= count_both;
+                        if both_types {
+                            // Factor out any common prefix.
+                            let common_length = common_prefix(text_insert, text_delete);
+                            if common_length != 0 {
+                                if pointer > 0 {
+                                    match &mut diffs[pointer - 1] {
+                                        Diff::Equal(this_diff1, this_diff2) => {
+                                            this_diff1.len += common_length;
+                                            this_diff2.len += common_length;
+                                        }
+                                        _ => unreachable!(
+                                            "previous diff should have been an equality"
+                                        ),
+                                    }
+                                } else {
+                                    diffs.insert(
+                                        pointer,
+                                        Diff::Equal(
+                                            text_delete.substring(..common_length),
+                                            text_insert.substring(..common_length),
+                                        ),
+                                    );
+                                    pointer += 1;
+                                }
+                                text_insert = text_insert.substring(common_length..);
+                                text_delete = text_delete.substring(common_length..);
+                            }
+                            // Factor out any common suffix.
+                            let common_length = common_suffix(text_insert, text_delete);
+                            if common_length != 0 {
+                                diffs[pointer].grow_left(common_length);
+                                text_insert.len -= common_length;
+                                text_delete.len -= common_length;
+                            }
+                        }
+                        // Insert the merged records.
+                        if !text_delete.is_empty() {
+                            diffs.insert(pointer, Diff::Delete(text_delete));
+                            pointer += 1;
+                        }
+                        if !text_insert.is_empty() {
+                            diffs.insert(pointer, Diff::Insert(text_insert));
+                            pointer += 1;
+                        }
+                    } else if pointer > 0 {
+                        if let Some(Diff::Equal(prev_equal1, prev_equal2)) =
+                            diffs.get_mut(pointer - 1)
+                        {
+                            // Merge this equality with the previous one.
+                            prev_equal1.len += text.len;
+                            prev_equal2.len += text.len;
+                            diffs.remove(pointer);
+                            pointer -= 1;
+                        }
+                    }
+                    count_insert = 0;
+                    count_delete = 0;
+                    text_delete = Range::empty();
+                    text_insert = Range::empty();
+                }
+            }
+            pointer += 1;
+        }
+        if diffs.last().unwrap().text().is_empty() {
+            diffs.pop(); // Remove the dummy entry at the end.
+        }
+
+        // Second pass: look for single edits surrounded on both sides by equalities
+        // which can be shifted sideways to eliminate an equality.
+        // e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
+        let mut changes = false;
+        let mut pointer = 1;
+        // Intentionally ignore the first and last element (don't need checking).
+        while let Some(&next_diff) = diffs.get(pointer + 1) {
+            let prev_diff = diffs[pointer - 1];
+            let this_diff = diffs[pointer];
+            if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) {
+                // This is a single edit surrounded by equalities.
+                if this_diff.text().ends_with(prev_diff) {
+                    // Shift the edit over the previous equality.
+                    diffs[pointer].shift_left(prev_diff.len);
+                    diffs[pointer + 1].grow_left(prev_diff.len);
+                    diffs.remove(pointer - 1); // Delete prev_diff.
+                    changes = true;
+                } else if this_diff.text().starts_with(next_diff) {
+                    // Shift the edit over the next equality.
+                    diffs[pointer - 1].grow_right(next_diff.len);
+                    diffs[pointer].shift_right(next_diff.len);
+                    diffs.remove(pointer + 1); // Delete next_diff.
+                    changes = true;
+                }
+            }
+            pointer += 1;
+        }
+        // If shifts were made, the diff needs reordering and another shift sweep.
+        if !changes {
+            return;
+        }
+    }
+}
+
+impl Debug for Chunk<'_> {
+    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+        let (name, text) = match *self {
+            Chunk::Equal(text) => ("Equal", text),
+            Chunk::Delete(text) => ("Delete", text),
+            Chunk::Insert(text) => ("Insert", text),
+        };
+        write!(formatter, "{}({:?})", name, text)
+    }
+}
+
+impl Debug for Diff<'_, '_> {
+    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+        let (name, range) = match *self {
+            Diff::Equal(range, _) => ("Equal", range),
+            Diff::Delete(range) => ("Delete", range),
+            Diff::Insert(range) => ("Insert", range),
+        };
+        formatter.write_str(name)?;
+        formatter.write_str("(\"")?;
+        for ch in range.chars() {
+            if ch == '\'' {
+                // escape_debug turns this into "\'" which is unnecessary.
+                formatter.write_char(ch)?;
+            } else {
+                Display::fmt(&ch.escape_debug(), formatter)?;
+            }
+        }
+        formatter.write_str("\")")?;
+        Ok(())
+    }
+}
diff --git a/src/range.rs b/src/range.rs
new file mode 100644 (file)
index 0000000..55cbc44
--- /dev/null
@@ -0,0 +1,141 @@
+use crate::find::find;
+use std::fmt::Debug;
+use std::ops::{self, RangeFrom, RangeFull, RangeTo};
+
+// A half-open window `[offset, offset + len)` into a shared document of chars.
+// Copying a `Range` copies only the three fields, never the document.
+#[derive(Copy, Clone)]
+pub struct Range<'a> {
+    // The full document this range points into.
+    pub doc: &'a [char],
+    // Start position of the window within `doc`, in chars.
+    pub offset: usize,
+    // Number of chars covered by the window.
+    pub len: usize,
+}
+
+impl<'a> Range<'a> {
+    // An empty range over an empty document.
+    pub fn empty() -> Self {
+        Range {
+            doc: &[],
+            offset: 0,
+            len: 0,
+        }
+    }
+
+    // Range covering `bounds` of `doc`; panics if the bounds are out of range.
+    pub fn new(doc: &'a [char], bounds: impl RangeBounds) -> Self {
+        let (offset, len) = bounds.index(doc.len());
+        Range { doc, offset, len }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    // Length of the covered text in UTF-8 bytes (`len` itself is in chars).
+    pub fn len_bytes(&self) -> usize {
+        self.chars().map(char::len_utf8).sum()
+    }
+
+    // Sub-range relative to this range; panics if `bounds` is out of range.
+    pub fn substring(&self, bounds: impl RangeBounds) -> Self {
+        let (offset, len) = bounds.index(self.len);
+        Range {
+            doc: self.doc,
+            offset: self.offset + offset,
+            len,
+        }
+    }
+
+    // Non-panicking variant of `substring`.
+    pub fn get(&self, bounds: impl RangeBounds) -> Option<Self> {
+        let (offset, len) = bounds.try_index(self.len)?;
+        Some(Range {
+            doc: self.doc,
+            offset: self.offset + offset,
+            len,
+        })
+    }
+
+    // Split into `[..mid]` and `[mid..]`; panics if `mid > self.len`.
+    pub fn split_at(&self, mid: usize) -> (Self, Self) {
+        (self.substring(..mid), self.substring(mid..))
+    }
+
+    // Iterate the chars covered by this range. The iterator borrows the
+    // document (lifetime 'a), not the `Range` value itself.
+    pub fn chars(
+        &self,
+    ) -> impl Iterator<Item = char> + DoubleEndedIterator + ExactSizeIterator + 'a {
+        slice(*self).iter().copied()
+    }
+
+    pub fn starts_with(&self, prefix: impl AsRef<[char]>) -> bool {
+        slice(*self).starts_with(prefix.as_ref())
+    }
+
+    pub fn ends_with(&self, suffix: impl AsRef<[char]>) -> bool {
+        slice(*self).ends_with(suffix.as_ref())
+    }
+
+    // Position of the first occurrence of `needle` within this range, if any.
+    pub fn find(&self, needle: impl AsRef<[char]>) -> Option<usize> {
+        find(slice(*self), needle.as_ref())
+    }
+}
+
+pub fn slice(range: Range) -> &[char] {
+    if cfg!(debug)
+        && range
+            .doc
+            .get(range.offset..range.offset + range.len)
+            .is_none()
+    {
+        eprintln!(
+            "doc={:?} offset={} len={}",
+            range.doc, range.offset, range.len
+        );
+    }
+    &range.doc[range.offset..range.offset + range.len]
+}
+
+// Lets a `Range` be passed anywhere an `impl AsRef<[char]>` needle is
+// expected (e.g. `Range::find`, `starts_with`, `ends_with`).
+impl AsRef<[char]> for Range<'_> {
+    fn as_ref(&self) -> &[char] {
+        slice(*self)
+    }
+}
+
+// Bounds argument accepted by `Range::new`, `Range::substring` and
+// `Range::get`, implemented for the std range syntaxes below.
+pub trait RangeBounds: Sized + Clone + Debug {
+    // Returns (offset, len).
+    fn try_index(self, len: usize) -> Option<(usize, usize)>;
+    // Panicking wrapper around `try_index`; the clone keeps `self` available
+    // for the panic message.
+    fn index(self, len: usize) -> (usize, usize) {
+        match self.clone().try_index(len) {
+            Some(range) => range,
+            None => panic!("index out of range, index={:?}, len={}", self, len),
+        }
+    }
+}
+
+impl RangeBounds for ops::Range<usize> {
+    fn try_index(self, len: usize) -> Option<(usize, usize)> {
+        if self.start <= self.end && self.end <= len {
+            Some((self.start, self.end - self.start))
+        } else {
+            None
+        }
+    }
+}
+
+impl RangeBounds for RangeFrom<usize> {
+    fn try_index(self, len: usize) -> Option<(usize, usize)> {
+        if self.start <= len {
+            Some((self.start, len - self.start))
+        } else {
+            None
+        }
+    }
+}
+
+impl RangeBounds for RangeTo<usize> {
+    fn try_index(self, len: usize) -> Option<(usize, usize)> {
+        if self.end <= len {
+            Some((0, self.end))
+        } else {
+            None
+        }
+    }
+}
+
+impl RangeBounds for RangeFull {
+    // `..` always covers the entire length.
+    fn try_index(self, len: usize) -> Option<(usize, usize)> {
+        Some((0, len))
+    }
+}
diff --git a/src/tests.rs b/src/tests.rs
new file mode 100644 (file)
index 0000000..d2e3fd6
--- /dev/null
@@ -0,0 +1,591 @@
+use super::*;
+use once_cell::sync::OnceCell;
+
+// Builds a `Range` over the chars of `$text`. The chars live in a
+// per-call-site `static OnceCell`, so the returned range borrows `'static`
+// data and can outlive the enclosing scope.
+macro_rules! range {
+    ($text:expr) => {{
+        static CHARS: OnceCell<Vec<char>> = OnceCell::new();
+        let chars = CHARS.get_or_init(|| $text.chars().collect());
+        Range::new(chars, ..)
+    }};
+}
+
+// Constructs a `Solution` from a sequence of `Kind("text")` entries.
+//
+// `text1` is rebuilt from the Delete/Equal entries and `text2` from the
+// Insert/Equal entries; each entry then becomes a `Diff` whose ranges index
+// into those reconstructed docs, with `_i`/`_j` tracking the running char
+// offsets into text1/text2 respectively.
+macro_rules! diff_list {
+    () => {
+        Solution {
+            text1: Range::empty(),
+            text2: Range::empty(),
+            diffs: Vec::new(),
+        }
+    };
+    ($($kind:ident($text:literal)),+ $(,)?) => {{
+        // Projection of each chunk onto the old text: inserts contribute "".
+        #[allow(unused_macro_rules)]
+        macro_rules! text1 {
+            (Insert, $s:literal) => { "" };
+            (Delete, $s:literal) => { $s };
+            (Equal, $s:literal) => { $s };
+        }
+        // Projection of each chunk onto the new text: deletes contribute "".
+        #[allow(unused_macro_rules)]
+        macro_rules! text2 {
+            (Insert, $s:literal) => { $s };
+            (Delete, $s:literal) => { "" };
+            (Equal, $s:literal) => { $s };
+        }
+        let text1 = range!(concat!($(text1!($kind, $text)),*));
+        let text2 = range!(concat!($(text2!($kind, $text)),*));
+        let (_i, _j) = (&mut 0, &mut 0);
+        // Shadows the outer range! macro: converts one chunk into a Diff,
+        // advancing the running offset(s) for whichever doc(s) it touches.
+        #[allow(unused_macro_rules)]
+        macro_rules! range {
+            (Insert, $s:literal) => {
+                Diff::Insert(range(text2.doc, _j, $s))
+            };
+            (Delete, $s:literal) => {
+                Diff::Delete(range(text1.doc, _i, $s))
+            };
+            (Equal, $s:literal) => {
+                Diff::Equal(range(text1.doc, _i, $s), range(text2.doc, _j, $s))
+            };
+        }
+        Solution {
+            text1,
+            text2,
+            diffs: vec![$(range!($kind, $text)),*],
+        }
+    }};
+}
+
+fn range<'a>(doc: &'a [char], offset: &mut usize, text: &str) -> Range<'a> {
+    let len = text.chars().count();
+    let range = Range {
+        doc,
+        offset: *offset,
+        len,
+    };
+    *offset += len;
+    range
+}
+
+// Asserts that a solution's diffs match the expected chunk sequence,
+// printing both sides alongside `$msg` on failure.
+macro_rules! assert_diffs {
+    ([$($kind:ident($text:literal)),* $(,)?], $solution:ident, $msg:expr $(,)?) => {
+        let expected = &[$(Chunk::$kind($text)),*];
+        assert!(
+            same_diffs(expected, &$solution.diffs),
+            concat!($msg, "\nexpected={:#?}\nactual={:#?}"),
+            expected, $solution.diffs,
+        );
+    };
+}
+
+fn same_diffs(expected: &[Chunk], actual: &[Diff]) -> bool {
+    fn eq(expected: &str, actual: &Range) -> bool {
+        expected.chars().eq(slice(*actual).iter().copied())
+    }
+
+    expected.len() == actual.len()
+        && expected.iter().zip(actual).all(|pair| match pair {
+            (Chunk::Insert(expected), Diff::Insert(actual)) => eq(expected, actual),
+            (Chunk::Delete(expected), Diff::Delete(actual)) => eq(expected, actual),
+            (Chunk::Equal(expected), Diff::Equal(actual1, actual2)) => {
+                eq(expected, actual1) && eq(expected, actual2)
+            }
+            (_, _) => false,
+        })
+}
+
+#[test]
+fn test_common_prefix() {
+    let text1 = range!("abc");
+    let text2 = range!("xyz");
+    assert_eq!(0, common_prefix(text1, text2), "Null case");
+
+    let text1 = range!("1234abcdef");
+    let text2 = range!("1234xyz");
+    assert_eq!(4, common_prefix(text1, text2), "Non-null case");
+
+    let text1 = range!("1234");
+    let text2 = range!("1234xyz");
+    assert_eq!(4, common_prefix(text1, text2), "Whole case");
+}
+
+#[test]
+fn test_common_suffix() {
+    let text1 = range!("abc");
+    let text2 = range!("xyz");
+    assert_eq!(0, common_suffix(text1, text2), "Null case");
+
+    let text1 = range!("abcdef1234");
+    let text2 = range!("xyz1234");
+    assert_eq!(4, common_suffix(text1, text2), "Non-null case");
+
+    let text1 = range!("1234");
+    let text2 = range!("xyz1234");
+    assert_eq!(4, common_suffix(text1, text2), "Whole case");
+}
+
+// common_overlap: length of the longest suffix of text1 that is also a
+// prefix of text2.
+#[test]
+fn test_common_overlap() {
+    let text1 = Range::empty();
+    let text2 = range!("abcd");
+    assert_eq!(0, common_overlap(text1, text2), "Null case");
+
+    let text1 = range!("abc");
+    let text2 = range!("abcd");
+    assert_eq!(3, common_overlap(text1, text2), "Whole case");
+
+    let text1 = range!("123456");
+    let text2 = range!("abcd");
+    assert_eq!(0, common_overlap(text1, text2), "No overlap");
+
+    let text1 = range!("123456xxx");
+    let text2 = range!("xxxabcd");
+    assert_eq!(3, common_overlap(text1, text2), "Overlap");
+
+    // Some overly clever languages (C#) may treat ligatures as equal to their
+    // component letters. E.g. U+FB01 == 'fi'
+    let text1 = range!("fi");
+    let text2 = range!("\u{fb01}i");
+    assert_eq!(0, common_overlap(text1, text2), "Unicode");
+}
+
+// Exercises cleanup_merge: coalescing runs of same-kind diffs, factoring
+// common prefixes/suffixes out of delete/insert pairs, and sliding edits
+// across neighboring equalities.
+#[test]
+fn test_cleanup_merge() {
+    let mut solution = diff_list![];
+    cleanup_merge(&mut solution);
+    assert_diffs!([], solution, "Null case");
+
+    let mut solution = diff_list![Equal("a"), Delete("b"), Insert("c")];
+    cleanup_merge(&mut solution);
+    assert_diffs!(
+        [Equal("a"), Delete("b"), Insert("c")],
+        solution,
+        "No change case",
+    );
+
+    let mut solution = diff_list![Equal("a"), Equal("b"), Equal("c")];
+    cleanup_merge(&mut solution);
+    assert_diffs!([Equal("abc")], solution, "Merge equalities");
+
+    let mut solution = diff_list![Delete("a"), Delete("b"), Delete("c")];
+    cleanup_merge(&mut solution);
+    assert_diffs!([Delete("abc")], solution, "Merge deletions");
+
+    let mut solution = diff_list![Insert("a"), Insert("b"), Insert("c")];
+    cleanup_merge(&mut solution);
+    assert_diffs!([Insert("abc")], solution, "Merge insertions");
+
+    let mut solution = diff_list![
+        Delete("a"),
+        Insert("b"),
+        Delete("c"),
+        Insert("d"),
+        Equal("e"),
+        Equal("f"),
+    ];
+    cleanup_merge(&mut solution);
+    assert_diffs!(
+        [Delete("ac"), Insert("bd"), Equal("ef")],
+        solution,
+        "Merge interweave",
+    );
+
+    // "a" is a common prefix and "c" a common suffix of the delete/insert
+    // pair; both become equalities.
+    let mut solution = diff_list![Delete("a"), Insert("abc"), Delete("dc")];
+    cleanup_merge(&mut solution);
+    assert_diffs!(
+        [Equal("a"), Delete("d"), Insert("b"), Equal("c")],
+        solution,
+        "Prefix and suffix detection",
+    );
+
+    let mut solution = diff_list![
+        Equal("x"),
+        Delete("a"),
+        Insert("abc"),
+        Delete("dc"),
+        Equal("y"),
+    ];
+    cleanup_merge(&mut solution);
+    assert_diffs!(
+        [Equal("xa"), Delete("d"), Insert("b"), Equal("cy")],
+        solution,
+        "Prefix and suffix detection with equalities",
+    );
+
+    let mut solution = diff_list![Equal("a"), Insert("ba"), Equal("c")];
+    cleanup_merge(&mut solution);
+    assert_diffs!([Insert("ab"), Equal("ac")], solution, "Slide edit left");
+
+    let mut solution = diff_list![Equal("c"), Insert("ab"), Equal("a")];
+    cleanup_merge(&mut solution);
+    assert_diffs!([Equal("ca"), Insert("ba")], solution, "Slide edit right");
+
+    let mut solution = diff_list![
+        Equal("a"),
+        Delete("b"),
+        Equal("c"),
+        Delete("ac"),
+        Equal("x"),
+    ];
+    cleanup_merge(&mut solution);
+    assert_diffs!(
+        [Delete("abc"), Equal("acx")],
+        solution,
+        "Slide edit left recursive",
+    );
+
+    let mut solution = diff_list![
+        Equal("x"),
+        Delete("ca"),
+        Equal("c"),
+        Delete("b"),
+        Equal("a"),
+    ];
+    cleanup_merge(&mut solution);
+    assert_diffs!(
+        [Equal("xca"), Delete("cba")],
+        solution,
+        "Slide edit right recursive",
+    );
+
+    let mut solution = diff_list![Delete("b"), Insert("ab"), Equal("c")];
+    cleanup_merge(&mut solution);
+    assert_diffs!([Insert("a"), Equal("bc")], solution, "Empty range");
+
+    let mut solution = diff_list![Equal(""), Insert("a"), Equal("b")];
+    cleanup_merge(&mut solution);
+    assert_diffs!([Insert("a"), Equal("b")], solution, "Empty equality");
+}
+
+// Exercises cleanup_semantic_lossless: shifting edit boundaries so they
+// align with line, word, and sentence boundaries without changing the net
+// effect of the diff.
+#[test]
+fn test_cleanup_semantic_lossless() {
+    let mut solution = diff_list![];
+    cleanup_semantic_lossless(&mut solution);
+    assert_diffs!([], solution, "Null case");
+
+    let mut solution = diff_list![
+        Equal("AAA\r\n\r\nBBB"),
+        Insert("\r\nDDD\r\n\r\nBBB"),
+        Equal("\r\nEEE"),
+    ];
+    cleanup_semantic_lossless(&mut solution);
+    assert_diffs!(
+        [
+            Equal("AAA\r\n\r\n"),
+            Insert("BBB\r\nDDD\r\n\r\n"),
+            Equal("BBB\r\nEEE"),
+        ],
+        solution,
+        "Blank lines",
+    );
+
+    let mut solution = diff_list![Equal("AAA\r\nBBB"), Insert(" DDD\r\nBBB"), Equal(" EEE")];
+    cleanup_semantic_lossless(&mut solution);
+    assert_diffs!(
+        [Equal("AAA\r\n"), Insert("BBB DDD\r\n"), Equal("BBB EEE")],
+        solution,
+        "Line boundaries",
+    );
+
+    let mut solution = diff_list![Equal("The c"), Insert("ow and the c"), Equal("at.")];
+    cleanup_semantic_lossless(&mut solution);
+    assert_diffs!(
+        [Equal("The "), Insert("cow and the "), Equal("cat.")],
+        solution,
+        "Word boundaries",
+    );
+
+    let mut solution = diff_list![Equal("The-c"), Insert("ow-and-the-c"), Equal("at.")];
+    cleanup_semantic_lossless(&mut solution);
+    assert_diffs!(
+        [Equal("The-"), Insert("cow-and-the-"), Equal("cat.")],
+        solution,
+        "Alphanumeric boundaries",
+    );
+
+    let mut solution = diff_list![Equal("a"), Delete("a"), Equal("ax")];
+    cleanup_semantic_lossless(&mut solution);
+    assert_diffs!([Delete("a"), Equal("aax")], solution, "Hitting the start");
+
+    let mut solution = diff_list![Equal("xa"), Delete("a"), Equal("a")];
+    cleanup_semantic_lossless(&mut solution);
+    assert_diffs!([Equal("xaa"), Delete("a")], solution, "Hitting the end");
+
+    let mut solution = diff_list![Equal("The xxx. The "), Insert("zzz. The "), Equal("yyy.")];
+    cleanup_semantic_lossless(&mut solution);
+    assert_diffs!(
+        [Equal("The xxx."), Insert(" The zzz."), Equal(" The yyy.")],
+        solution,
+        "Sentence boundaries",
+    );
+}
+
+// Exercises cleanup_semantic: eliminating short equalities surrounded by
+// edits, and factoring shared overlap out of adjacent delete/insert pairs.
+#[test]
+fn test_cleanup_semantic() {
+    let mut solution = diff_list![];
+    cleanup_semantic(&mut solution);
+    assert_diffs!([], solution, "Null case");
+
+    let mut solution = diff_list![Delete("ab"), Insert("cd"), Equal("12"), Delete("e")];
+    cleanup_semantic(&mut solution);
+    assert_diffs!(
+        [Delete("ab"), Insert("cd"), Equal("12"), Delete("e")],
+        solution,
+        "No elimination #1",
+    );
+
+    let mut solution = diff_list![Delete("abc"), Insert("ABC"), Equal("1234"), Delete("wxyz")];
+    cleanup_semantic(&mut solution);
+    assert_diffs!(
+        [Delete("abc"), Insert("ABC"), Equal("1234"), Delete("wxyz")],
+        solution,
+        "No elimination #2",
+    );
+
+    // The single-char equality "b" is cheaper to delete and re-insert.
+    let mut solution = diff_list![Delete("a"), Equal("b"), Delete("c")];
+    cleanup_semantic(&mut solution);
+    assert_diffs!([Delete("abc"), Insert("b")], solution, "Simple elimination",);
+
+    let mut solution = diff_list![
+        Delete("ab"),
+        Equal("cd"),
+        Delete("e"),
+        Equal("f"),
+        Insert("g"),
+    ];
+    cleanup_semantic(&mut solution);
+    assert_diffs!(
+        [Delete("abcdef"), Insert("cdfg")],
+        solution,
+        "Backpass elimination",
+    );
+
+    let mut solution = diff_list![
+        Insert("1"),
+        Equal("A"),
+        Delete("B"),
+        Insert("2"),
+        Equal("_"),
+        Insert("1"),
+        Equal("A"),
+        Delete("B"),
+        Insert("2"),
+    ];
+    cleanup_semantic(&mut solution);
+    assert_diffs!(
+        [Delete("AB_AB"), Insert("1A2_1A2")],
+        solution,
+        "Multiple elimination",
+    );
+
+    let mut solution = diff_list![Equal("The c"), Delete("ow and the c"), Equal("at.")];
+    cleanup_semantic(&mut solution);
+    assert_diffs!(
+        [Equal("The "), Delete("cow and the "), Equal("cat.")],
+        solution,
+        "Word boundaries",
+    );
+
+    // A two-char overlap ("xx") is not long enough to be worth factoring.
+    let mut solution = diff_list![Delete("abcxx"), Insert("xxdef")];
+    cleanup_semantic(&mut solution);
+    assert_diffs!(
+        [Delete("abcxx"), Insert("xxdef")],
+        solution,
+        "No overlap elimination",
+    );
+
+    let mut solution = diff_list![Delete("abcxxx"), Insert("xxxdef")];
+    cleanup_semantic(&mut solution);
+    assert_diffs!(
+        [Delete("abc"), Equal("xxx"), Insert("def")],
+        solution,
+        "Overlap elimination",
+    );
+
+    let mut solution = diff_list![Delete("xxxabc"), Insert("defxxx")];
+    cleanup_semantic(&mut solution);
+    assert_diffs!(
+        [Insert("def"), Equal("xxx"), Delete("abc")],
+        solution,
+        "Reverse overlap elimination",
+    );
+
+    let mut solution = diff_list![
+        Delete("abcd1212"),
+        Insert("1212efghi"),
+        Equal("----"),
+        Delete("A3"),
+        Insert("3BC"),
+    ];
+    cleanup_semantic(&mut solution);
+    assert_diffs!(
+        [
+            Delete("abcd"),
+            Equal("1212"),
+            Insert("efghi"),
+            Equal("----"),
+            Delete("A"),
+            Equal("3"),
+            Insert("BC"),
+        ],
+        solution,
+        "Two overlap eliminations",
+    );
+}
+
+// Calls the private `bisect` step directly and pins the exact diff sequence
+// it produces for two 3-char inputs.
+#[test]
+fn test_bisect() {
+    let text1 = range!("cat");
+    let text2 = range!("map");
+    let solution = Solution {
+        text1,
+        text2,
+        diffs: bisect(text1, text2),
+    };
+    assert_diffs!(
+        [
+            Delete("c"),
+            Insert("m"),
+            Equal("a"),
+            Delete("t"),
+            Insert("p"),
+        ],
+        solution,
+        "Normal",
+    );
+}
+
+// End-to-end tests of the crate-internal `main` entry point: equalities,
+// simple insertions/deletions, overlaps, and a large-equality case.
+#[test]
+fn test_main() {
+    let solution = main(Range::empty(), Range::empty());
+    assert_diffs!([], solution, "Null case");
+
+    let solution = main(range!("abc"), range!("abc"));
+    assert_diffs!([Equal("abc")], solution, "Equality");
+
+    let solution = main(range!("abc"), range!("ab123c"));
+    assert_diffs!(
+        [Equal("ab"), Insert("123"), Equal("c")],
+        solution,
+        "Simple insertion",
+    );
+
+    let solution = main(range!("a123bc"), range!("abc"));
+    assert_diffs!(
+        [Equal("a"), Delete("123"), Equal("bc")],
+        solution,
+        "Simple deletion",
+    );
+
+    let solution = main(range!("abc"), range!("a123b456c"));
+    assert_diffs!(
+        [
+            Equal("a"),
+            Insert("123"),
+            Equal("b"),
+            Insert("456"),
+            Equal("c"),
+        ],
+        solution,
+        "Two insertions",
+    );
+
+    let solution = main(range!("a123b456c"), range!("abc"));
+    assert_diffs!(
+        [
+            Equal("a"),
+            Delete("123"),
+            Equal("b"),
+            Delete("456"),
+            Equal("c"),
+        ],
+        solution,
+        "Two deletions",
+    );
+
+    let solution = main(range!("a"), range!("b"));
+    assert_diffs!([Delete("a"), Insert("b")], solution, "Simple case #1");
+
+    let solution = main(
+        range!("Apples are a fruit."),
+        range!("Bananas are also fruit."),
+    );
+    assert_diffs!(
+        [
+            Delete("Apple"),
+            Insert("Banana"),
+            Equal("s are a"),
+            Insert("lso"),
+            Equal(" fruit."),
+        ],
+        solution,
+        "Simple case #2",
+    );
+
+    // Non-ASCII chars and control chars diff the same as any other char.
+    let solution = main(range!("ax\t"), range!("\u{0680}x\000"));
+    assert_diffs!(
+        [
+            Delete("a"),
+            Insert("\u{0680}"),
+            Equal("x"),
+            Delete("\t"),
+            Insert("\000"),
+        ],
+        solution,
+        "Simple case #3",
+    );
+
+    let solution = main(range!("1ayb2"), range!("abxab"));
+    assert_diffs!(
+        [
+            Delete("1"),
+            Equal("a"),
+            Delete("y"),
+            Equal("b"),
+            Delete("2"),
+            Insert("xab"),
+        ],
+        solution,
+        "Overlap #1",
+    );
+
+    let solution = main(range!("abcy"), range!("xaxcxabc"));
+    assert_diffs!(
+        [Insert("xaxcx"), Equal("abc"), Delete("y")],
+        solution,
+        "Overlap #2",
+    );
+
+    let solution = main(
+        range!("ABCDa=bcd=efghijklmnopqrsEFGHIJKLMNOefg"),
+        range!("a-bcd-efghijklmnopqrs"),
+    );
+    assert_diffs!(
+        [
+            Delete("ABCD"),
+            Equal("a"),
+            Delete("="),
+            Insert("-"),
+            Equal("bcd"),
+            Delete("="),
+            Insert("-"),
+            Equal("efghijklmnopqrs"),
+            Delete("EFGHIJKLMNOefg"),
+        ],
+        solution,
+        "Overlap #3",
+    );
+
+    let solution = main(
+        range!("a [[Pennsylvania]] and [[New"),
+        range!(" and [[Pennsylvania]]"),
+    );
+    assert_diffs!(
+        [
+            Insert(" "),
+            Equal("a"),
+            Insert("nd"),
+            Equal(" [[Pennsylvania]]"),
+            Delete(" and [[New"),
+        ],
+        solution,
+        "Large equality",
+    );
+}
diff --git a/tests/test.rs b/tests/test.rs
new file mode 100644 (file)
index 0000000..7debb05
--- /dev/null
@@ -0,0 +1,52 @@
+// Upstream diff-match-patch's test suite is imported as unit tests in
+// src/tests.rs, as they test APIs which are private in the Rust implementation.
+//
+// This directory is for Rust-specific integration tests and regression tests.
+
+#![allow(clippy::non_ascii_literal)]
+
+use dissimilar::{diff, Chunk};
+
+#[test]
+fn test_unicode() {
+    // Unicode snowman and unicode comet have the same first two bytes. A
+    // byte-based diff would produce a 2-byte Equal followed by 1-byte Delete
+    // and Insert.
+    let snowman = "\u{2603}";
+    let comet = "\u{2604}";
+    assert_eq!(snowman.as_bytes()[..2], comet.as_bytes()[..2]);
+
+    // The char-based diff sees two distinct single chars, so the result is
+    // a whole-char Delete followed by a whole-char Insert.
+    let d = diff(snowman, comet);
+    assert_eq!(d, vec![Chunk::Delete(snowman), Chunk::Insert(comet)]);
+}
+
+// Regression test (issue #9): diffing strings containing CJK chars must
+// split on char boundaries, pairing each Delete with the Insert replacing it.
+#[test]
+fn test_issue9() {
+    let a = "[乀丁abcd一]";
+    let b = "[一abcd丁]";
+    let d = diff(a, b);
+    assert_eq!(
+        d,
+        vec![
+            Chunk::Equal("["),
+            Chunk::Delete("乀丁"),
+            Chunk::Insert("一"),
+            Chunk::Equal("abcd"),
+            Chunk::Delete("一"),
+            Chunk::Insert("丁"),
+            Chunk::Equal("]"),
+        ],
+    );
+}
+
+// Regression test (issue #15): the trailing "ダ" must survive as an Equal
+// chunk rather than being folded into the deleted "の".
+#[test]
+fn test_issue15() {
+    let a = "A のダ";
+    let b = "A ダ";
+    let d = diff(a, b);
+
+    assert_eq!(
+        d,
+        vec![Chunk::Equal("A "), Chunk::Delete("の"), Chunk::Equal("ダ")],
+    );
+}