文本比较算法

0

最近我是在网上看过一个例子,突然就有点兴趣了,于是乎自己就写了一个算法。

先列出网上的一个例子:http://wenbenbijiao.renrensousuo.com/#diff

这个是一个在线文本编辑的,但是感觉有一些问题。例如:

"A", "B", "C", "A", "C", "A", "D", "F", "A", "B", "C", "A", "C", "A", "D", "F"
"B", "C", "X", "C", "A", "D", "F", "E", "S", "B", "A", "B", "C", "A", "C", "A"

上面这两个文本,换行后比较结果:

文本比较结果

明显这个是有点问题的。

于是我在网上看了一篇文章:http://blog.csdn.net/sunskyor/article/details/4491656
根据这篇文章里面的算法分析,实现了算法,分析出来的结果比较准确,但是多行以后效率实在是低下,贴出来希望大家能够给出一些意见优化:

package com.acgist.nlp.query.compare;
 
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
 
import org.apache.commons.collections.CollectionUtils;
 
public class Path {
 
    private List<String> source;
    private List<String> target;
 
    private boolean[][] map;
 
    /**
     * 路径
     */
    private List<Points> points = new ArrayList<Points>();
     
    public Path() {
    }
 
    public Path(List<String> source, List<String> target) {
        this.source = source;
        this.target = target;
    }
 
    public static void main(String[] args) {
        long b = System.currentTimeMillis();
//      Path path = new Path(Arrays.asList(new String[] {"B", "C", "X", "C", "A", "D", "F", "E", "S", "B", "A", "B", "C", "A", "C", "A"}), Arrays.asList(new String[]{"A", "B", "C", "A", "C", "A", "D", "F"}));
//      Path path = new Path(Arrays.asList(new String[] {"A", "B", "C", "A", "C", "A", "D", "F"}), Arrays.asList(new String[]{"B", "C", "X", "C", "A", "D", "F", "E", "S", "B", "A", "B", "C", "A", "C", "A"}));
        Path path = new Path(Arrays.asList(new String[]{"A", "B", "C", "A", "C", "A", "D", "F", "A", "B", "C", "A", "C", "A", "D", "F"}), Arrays.asList(new String[] {"B", "C", "X", "C", "A", "D", "F", "E", "S", "B", "A", "B", "C", "A", "C", "A"}));
        path.execute();
        long e = System.currentTimeMillis();
        System.out.println("消耗时间:" + (e - b) + "ms");
    }
 
    /**
     * 执行比较
     */
    public void execute() {
        if (CollectionUtils.isEmpty(source) || CollectionUtils.isEmpty(target))
            return;
        map = new boolean[this.source.size()][this.target.size()];
        for (int x = 0; x < this.source.size(); x++) {
            for (int y = 0; y < this.target.size(); y++) {
                if (this.source.get(x) == this.target.get(y) || this.source.get(x).equals(this.target.get(y)))
                    map[x][y] = true;
                else
                    map[x][y] = false;
            }
        }
         
        print(map);
         
         
        for (int x = 0; x < this.source.size(); x++) {
            for (int y = 0; y < this.target.size(); y++) {
                if(map[x][y])
                    this.points.add(find(x, y, new Points(), null));
            }
        }
         
        Collections.sort(this.points, (Points a, Points b) -> {
            if(a == null || b == null)
                return 0;
            if(a.getNumber() >= b.getNumber())
                return 1;
            return 0;
        });
         
        int maxNumber = this.points.get(0).getNumber();
         
        List<Points> maxPoints = new ArrayList<Points>();
        this.points.forEach(points -> {
            if(points.getNumber() == maxNumber)
                maxPoints.add(points);
            else
                return;
        });
         
        this.points.clear();
         
        for (Points points : maxPoints) {
            for (Point point : points.getPoints()) {
                System.out.print(point.getX() + "=" + point.getY() + "=" + point.getSame() + "----------");
            }
            System.out.println();
        }
         
        Points points = optimal(maxPoints);
        for(Point point : points.getPoints()) {
            if(point.getSame())
                System.out.println(this.source.get(point.getX()));
        }
    }
 
    /**
     * 计算最大路径
     */
    public Points find(int x, int y, Points points, Points optimal) {
        if(x == this.source.size() || y == this.target.size()) {
            if(optimal == null)
                return points;
            else if(optimal.getNumber() < points.getNumber())
                return points;
            return optimal;
        } else {
            points.addPoint(new Point(x, y, map[x][y]));
            if(map[x][y]) {
                optimal = find(x + 1, y + 1, new Points(points.getNumber(), points.clonePoints()), optimal);
            } else {
                optimal = find(x, y + 1, new Points(points.getNumber(), points.clonePoints()), optimal);
                optimal = find(x + 1, y, new Points(points.getNumber(), points.clonePoints()), optimal);
                optimal = find(x + 1, y + 1, new Points(points.getNumber(), points.clonePoints()), optimal);
            }
            return optimal;
        }
    }
     
    /**
     * 打印二维数组
     */
    public void print(boolean[][] map) {
        for (boolean[] bs : map) {
            System.out.print("----");
            for (boolean b : bs) {
                System.out.print((b ? "1" : "0") + "----");
            }
            System.out.println();
        }
    }
 
    /**
     * 计算最优路径
     */
    public Points optimal(List<Points> maxPoints){
        int index = 0;
        float similarity = 0;
        for (int i = 0; i < maxPoints.size(); i++) {
            Points points = maxPoints.get(i);
            float tmp = Float.valueOf(points.getNumber()) / Float.valueOf(points.getPoints().size());
            if(tmp > similarity) {
                similarity = tmp;
                index = i;
            }
        }
        return maxPoints.get(index);
    }
     
}
 
class Points {
    private int number;
    private List<Point> points;
     
    public Points() {
    }
 
    public Points(int number, List<Point> points) {
        this.number = number;
        this.points = points;
    }
 
    public void addPoint(Point point) {
        if(points == null)
            points = new ArrayList<>();
        if(point.getSame()) {
            number++;
        }
        points.add(point);
    }
     
    public void clear() {
        this.number = 0;
        this.points.clear();
    }
     
    public boolean isNotEmpty() {
        return CollectionUtils.isNotEmpty(this.points);
    }
     
    public List<Point> clonePoints() {
        return new ArrayList<>(points);
    }
     
    public int getNumber() {
        return number;
    }
     
    public void setNumber(int number) {
        this.number = number;
    }
     
    public List<Point> getPoints() {
        return points;
    }
     
    public void setPoints(List<Point> points) {
        this.points = points;
    }
 
}
 
class Point {
    private int x;
    private int y;
    private boolean same;
 
    public Point() {
    }
 
    public Point(int x, int y, boolean same) {
        this.x = x;
        this.y = y;
        this.same = same;
    }
 
    public int getX() {
        return x;
    }
 
    public void setX(int x) {
        this.x = x;
    }
 
    public int getY() {
        return y;
    }
 
    public void setY(int y) {
        this.y = y;
    }
 
    public boolean getSame() {
        return same;
    }
 
    public void setSame(boolean same) {
        this.same = same;
    }
 
}

下面是比较的结果:

文本比较算法

优化版本:http://www.acgist.com/article/265.html