1 -- Copyright (C) 2002 David Roundy 2 -- Copyright (C) 2005 Benedikt Schmidt 3 -- 4 -- This program is free software; you can redistribute it and/or modify 5 -- it under the terms of the GNU General Public License as published by 6 -- the Free Software Foundation; either version 2, or (at your option) 7 -- any later version. 8 -- 9 -- This program is distributed in the hope that it will be useful, 10 -- but WITHOUT ANY WARRANTY; without even the implied warranty of 11 -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 -- GNU General Public License for more details. 13 -- 14 -- You should have received a copy of the GNU General Public License 15 -- along with this program; see the file COPYING. If not, write to 16 -- the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 17 -- Boston, MA 02110-1301, USA. 18 19 {-# OPTIONS_GHC -cpp #-} 20 {-# LANGUAGE CPP #-} 21 22 -- | LCS stands for Longest Common Subsequence, and it is a relatively 23 -- challenging problem to find an LCS efficiently. This module implements 24 -- the algorithm described in: 25 -- 26 -- "An O(ND) Difference Algorithm and its Variations", Eugene Myers, 27 -- Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; 28 -- especially the variation described in section 4.2 and most refinements 29 -- implemented in GNU diff (D is the edit-distance). 30 -- 31 -- There is currently no heuristic to reduce the running time and produce 32 -- suboptimal output for large inputs with many differences. It behaves like 33 -- GNU diff with the -d option in this regard. 34 -- 35 -- In the first step, a hash value for every line is calculated and collisions 36 -- are marked with a special value. This reduces a string comparison to an 37 -- int comparison for line tuples where at least one of the hash values is 38 -- not equal to the special value. After that, lines which only exists in one 39 -- of the files are removed and marked as changed which reduces the running 40 -- time of the following difference algorithm. GNU diff additionally removes 41 -- lines that appear very often in the other file in some cases. 42 -- The last step tries to create longer changed regions and line up deletions 43 -- in the first file to insertions in the second by shifting changed lines 44 -- forward and backward. 45 module Lcs ( getChanges, aLen, 46 BArray, PArray, BSTArray, 47 shiftBoundaries ) where 48 49 import Control.Monad 50 import Data.Int 51 import Control.Monad.ST 52 import Data.Maybe 53 import ByteStringUtils (hashPS) 54 import qualified Data.ByteString as B (empty, ByteString) 55 import Data.Array.Base 56 import Data.Array.Unboxed 57 import qualified Data.Map as Map ( lookup, empty, insertWith ) 58 #include "impossible.h" 59 60 -- | create a list of changes between a and b, each change has the form 61 -- (starta, lima, startb, limb) which means that a[starta, lima) 62 -- has to be replaced by b[startb, limb) 63 getChanges :: [B.ByteString] -> [B.ByteString] 64 -> [(Int,[B.ByteString],[B.ByteString])] 65 getChanges a b = dropStart (initP a) (initP b) 1 66 67 dropStart :: PArray -> PArray -> Int 68 -> [(Int,[B.ByteString],[B.ByteString])] 69 dropStart a b off 70 | off > (aLen a) = [(off - 1, [], getSlice b off (aLen b))] 71 | off > (aLen b) = [(off - 1, getSlice a off (aLen a), [])] 72 | a!off == b!off = dropStart a b (off + 1) 73 | otherwise = dropEnd a b off 0 74 75 dropEnd :: PArray -> PArray -> Int -> Int 76 -> [(Int,[B.ByteString],[B.ByteString])] 77 dropEnd a b off end 78 | off > alast = [(off - 1, [], getSlice b off blast)] 79 | off > blast = [(off - 1, getSlice a off alast, [])] 80 | a!alast == b!blast = dropEnd a b off (end + 1) 81 | otherwise = getChanges' (a, (off, alast)) (b, (off, blast)) 82 where alast = aLen a - end 83 blast = aLen b - end 84 85 getSlice :: PArray -> Int -> Int -> [B.ByteString] 86 getSlice a from to 87 | from > to = [] 88 | otherwise = (a!(from)):(getSlice a (from + 1) to) 89 90 getChanges' :: (PArray, (Int, Int)) -> (PArray, (Int, Int)) 91 -> [(Int,[B.ByteString],[B.ByteString])] 92 getChanges' (a, abounds) (b, bbounds) = 93 map (convertPatch 0 a b) $ createPatch c_a c_b 94 where 95 -- If the last few characters of two lines are the same, the lines are 96 -- probably the same. The choice of 20 is plucked out of the air. 97 toHash x bnds = listArray bnds [ hashPS $ x!i | i <- range bnds] 98 ah = toHash a abounds :: HArray 99 mkAMap m (i:is) = 100 let ins (_,_,_,new) (collision,_,_,old) = 101 (collision || not (new == old), True, False, old) 102 m' = Map.insertWith ins (ah!i) (False, True, False, a!i) m 103 in mkAMap m' is 104 mkAMap m _ = m 105 hm_a = mkAMap Map.empty (range abounds) 106 -- 107 bh = toHash b bbounds :: HArray 108 mkBMap m (i:is) = 109 let ins (_,_,_,new) (collision,in_a,_,old) = 110 (collision || not (new == old), in_a, True, old) 111 m' = Map.insertWith ins (bh!i) (False, False, True, b!i) m 112 in mkBMap m' is 113 mkBMap m _ = m 114 hm = mkBMap hm_a (range bbounds) 115 -- take care of collisions, if there are different lines with the 116 -- same hash in both files, then set the hash to markColl, 117 -- PackedStrings are compared for two lines with the hash markColl 118 get (i, h) = case Map.lookup h hm of 119 Just (_,False,_,_) -> Nothing 120 Just (_,_,False,_) -> Nothing 121 Just (False,True,True,_) -> Just (i, h) 122 Just (True,True,True,_) -> Just (i, markColl) 123 Nothing -> impossible 124 125 a' = catMaybes $ map get [(i, ah!i) | i <- range (bounds ah)] 126 b' = catMaybes $ map get [(i, bh!i) | i <- range (bounds bh)] 127 128 (c_a, c_b) = diffArr a' b' (a, abounds) (b, bbounds) 129 130 -- | mark hash value where collision occured 131 markColl :: Int32 132 markColl = 2345677 133 134 -- | return arrays with changes in a and b (1 indexed), offsets start with 0 135 diffArr :: [(Int,Int32)] -> [(Int,Int32)] 136 -> (PArray, (Int, Int)) -> (PArray, (Int, Int)) 137 -> (BArray, BArray) 138 diffArr a b (p_a, (off_a, l_a)) (p_b, (off_b, l_b)) = runST ( 139 do let h_a = initH (map snd a) 140 h_b = initH (map snd b) 141 m_a = initM (map fst a) 142 m_b = initM (map fst b) 143 end_a = (aLen p_a) 144 end_b = (aLen p_b) 145 c_a <- initVChanged end_a 146 c_b <- initVChanged end_b 147 mapM_ (\ (l,_) -> writeArray c_a l False) $ a 148 mapM_ (\ (l,_) -> writeArray c_b l False) $ b 149 _ <- cmpseq h_a h_b p_a p_b m_a m_b c_a c_b 0 0 (aLen h_a) (aLen h_b) 150 let unchanged ar = do {xs <- getElems ar; return $ (length $ filter not $ xs) -1} 151 err <- liftM2 (/=) (unchanged c_a) (unchanged c_b) 152 when (err) impossible 153 -- Mark common lines at beginning and end 154 mapM_ (\ i -> writeArray c_a i False ) $ [1..(off_a - 1)] 155 mapM_ (\ i -> writeArray c_b i False ) $ [1..(off_b - 1)] 156 mapM_ (\ i -> writeArray c_a i False ) $ [(l_a + 1) .. (end_a)] 157 mapM_ (\ i -> writeArray c_b i False ) $ [(l_b + 1) .. (end_b)] 158 shiftBoundaries c_a c_b p_a 1 1 159 shiftBoundaries c_b c_a p_b 1 1 160 err1 <- liftM2 (/=) (unchanged c_a) (unchanged c_b) 161 when (err1) impossible 162 c_a' <- unsafeFreeze c_a 163 c_b' <- unsafeFreeze c_b 164 return (c_a', c_b')) 165 166 -- | set changes array for a and b and return number of changed lines 167 cmpseq :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray 168 -> BSTArray s -> BSTArray s -> Int -> Int -> Int -> Int -> ST s Int 169 cmpseq _ _ _ _ _ _ _ _ _ _ 0 0 = do return 0 170 cmpseq h_a h_b p_a p_b m_a m_b c_a c_b off_a off_b l_a l_b = do 171 let lim_a = off_a+l_a 172 lim_b = off_b+l_b 173 off_a' = findSnake h_a h_b p_a p_b m_a m_b off_a off_b l_a l_b off_a off_b 174 off_b' = off_b+off_a'-off_a 175 lim_a' = findSnakeRev h_a h_b p_a p_b m_a m_b lim_a lim_b off_a' off_b' 176 lim_b' = lim_b+lim_a'-lim_a 177 l_a' = lim_a'-off_a' 178 l_b' = lim_b'-off_b' 179 if l_a' == 0 || l_b' == 0 180 then if l_a' == 0 181 then do when (l_b' > 0) $ 182 mapM_ (\i -> writeArray c_b (m_b!i) True) 183 [(off_b' + 1) .. lim_b'] 184 return l_b' 185 else do when (l_a' > 0) $ 186 mapM_ (\i -> writeArray c_a (m_a!i) True) 187 [(off_a' + 1) .. lim_a'] 188 return l_a' 189 else do let m = l_a' + l_b' 190 del = l_a' - l_b' 191 dodd = odd $ del 192 v <- initV m 193 vrev <- initVRev m l_a' 194 writeArray vrev 0 l_a' 195 writeArray v 0 0 196 (xmid, ymid, _) <- findDiag 1 h_a h_b p_a p_b m_a m_b v vrev 197 off_a' off_b' l_a' l_b' del dodd 198 when ((xmid == 0 && ymid == 0) || (xmid == l_a' && ymid == l_b') 199 || (xmid < 0 || ymid < 0 || xmid > l_a' || ymid > l_b')) 200 impossible 201 c1 <- cmpseq h_a h_b p_a p_b m_a m_b c_a c_b 202 off_a' off_b' xmid ymid 203 c2 <- cmpseq h_a h_b p_a p_b m_a m_b c_a c_b 204 (off_a' + xmid) (off_b' + ymid) 205 (l_a' - xmid) (l_b' - ymid) 206 return $ c1 + c2 207 208 -- | return (xmid, ymid, cost) for the two substrings 209 -- a[off_a+1..off_a+1+l_a] and b 210 findDiag :: Int -> HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray 211 -> VSTArray s -> VSTArray s -> Int -> Int -> Int -> Int -> Int -> Bool 212 -> ST s (Int, Int, Int) 213 findDiag c h_a h_b p_a p_b m_a m_b v vrev off_a off_b l_a l_b del dodd = do 214 if c > l_a + l_b then error "findDiag failed" else return () 215 r <- findF 216 case r of 217 Just (xmid, ymid) -> return (xmid, ymid, (c*2 - 1)) 218 Nothing -> 219 do r' <- findR 220 case r' of 221 Just (xmid, ymid) -> return (xmid, ymid, c*2) 222 Nothing -> findDiag (c + 1) h_a h_b p_a p_b m_a m_b v vrev 223 off_a off_b l_a l_b del dodd 224 where fdmax = if c <= l_a then c else l_a - ((l_a + c) `mod` 2) 225 rdmax = if c <= l_b then c else l_b - ((l_b + c) `mod` 2) 226 lastrdmax = if (c-1) <= l_b then c-1 else l_b-((l_b + (c-1) `mod` 2)) 227 lastrdmin = -(if (c-1) <= l_a then c-1 else l_a-((l_a + (c-1)) `mod` 2)) 228 fdmin = -rdmax 229 rdmin = -fdmax 230 findF = findF' fdmax 231 findR = findR' rdmax 232 findF' d = do x <- findOne h_a h_b p_a p_b m_a m_b v d off_a off_b l_a l_b 233 if dodd && d - del >= lastrdmin && d - del <= lastrdmax 234 then do xr <- readArray vrev (d - del) 235 if xr <= x then return $ Just (x, x - d) 236 else if d <= fdmin then return Nothing 237 else findF' (d-2) 238 else if d <= fdmin then return Nothing else findF' (d-2) 239 findR' d = do x <- findOneRev h_a h_b p_a p_b m_a m_b vrev d del off_a off_b 240 if not dodd && (d + del >= fdmin) && (d + del <= fdmax) 241 then do xf <- readArray v (d + del) 242 if x <= xf then return $ Just (x,x-del-d) 243 else if d <= rdmin then return Nothing 244 else findR' (d-2) 245 else if d <= rdmin then return Nothing else findR' (d-2) 246 247 -- | find position on diag d with one more insert/delete going forward 248 findOne :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray 249 -> VSTArray s -> Int -> Int -> Int -> Int -> Int -> ST s Int 250 findOne h_a h_b p_a p_b m_a m_b v d off_a off_b l_a l_b = do 251 x0 <- do xbelow <- readArray v (d - 1) 252 xover <- readArray v (d + 1) 253 return $ if xover > xbelow then xover else xbelow + 1 254 let y0 = x0 - d 255 x = findSnake h_a h_b p_a p_b m_a m_b (x0+off_a) (y0+off_b) 256 l_a l_b off_a off_b 257 writeArray v d (x - off_a) 258 return (x-off_a) 259 260 -- | follow snake from northwest to southeast, x and y are absolute positions 261 findSnake :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray 262 -> Int -> Int -> Int -> Int -> Int -> Int -> Int 263 findSnake h_a h_b p_a p_b m_a m_b x y l_a l_b off_a off_b = 264 if x < l_a + off_a && y < l_b + off_b && h_a!(x+1) == h_b!(y+1) 265 && (h_a!(x+1) /= markColl || p_a!(m_a!(x+1)) == p_b!(m_b!(y+1))) 266 then findSnake h_a h_b p_a p_b m_a m_b (x + 1) (y + 1) l_a l_b off_a off_b 267 else x 268 269 -- | find position on diag d with one more insert/delete going backward 270 findOneRev :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray 271 -> VSTArray s -> Int -> Int -> Int -> Int -> ST s Int 272 findOneRev h_a h_b p_a p_b m_a m_b v d del off_a off_b = do 273 x0 <- do xbelow <- readArray v (d - 1) 274 xover <- readArray v (d + 1) 275 return $ if xbelow < xover then xbelow else xover-1 276 let y0 = x0 - del - d 277 x = findSnakeRev h_a h_b p_a p_b m_a m_b (x0+off_a) (y0+off_b) 278 off_a off_b 279 writeArray v d (x-off_a) 280 return (x-off_a) 281 282 -- | follow snake from southeast to northwest, x and y are absolute positions 283 findSnakeRev :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray 284 -> Int -> Int -> Int -> Int -> Int 285 findSnakeRev h_a h_b p_a p_b m_a m_b x y off_a off_b = 286 if x > off_a && y > off_b && h_a!x == h_b!y 287 && (h_a!x /= markColl || p_a!(m_a!x) == p_b!(m_b!y)) 288 then findSnakeRev h_a h_b p_a p_b m_a m_b (x - 1) (y - 1) off_a off_b 289 else x 290 291 -- | try to create nicer diffs by shifting around regions of changed lines 292 shiftBoundaries :: BSTArray s -> BSTArray s -> PArray -> Int -> Int -> ST s () 293 shiftBoundaries c_a c_b p_a i_ j_ = 294 do x <- nextChanged c_a i_ 295 case x of 296 Just start -> 297 do let skipped = start - i_ 298 j1 <- nextUnchangedN c_b skipped j_ 299 end <- nextUnchanged c_a start 300 j2 <- nextUnchanged c_b j1 301 (i3,j3) <- expand start end j2 302 shiftBoundaries c_a c_b p_a i3 j3 303 Nothing -> return () -- no change up to end of file 304 where noline = (aLen p_a) + 1 305 expand start i j = 306 do let len = i - start 307 (start0,i0,j0) <- shiftBackward start i j 308 b <- if j0 > 1 then readArray c_b (j0-1) else return False 309 let corr = if b then i0 else noline 310 let blank = if p_a!(i0-1) == B.empty then i0 311 else noline 312 (start1,i1,j1,corr1,blank1) <- shiftForward start0 i0 j0 corr blank 313 -- prefer corresponding to ending with blank line 314 let newi = if corr1 == noline then blank1 315 else corr1 316 (start2,i2,j2) <- moveCorr start1 i1 j1 newi 317 if len /= i2 - start2 318 then expand start2 i2 j2 319 else return (i2, j2) 320 shiftBackward start i j = 321 if start > 1 && p_a!(i-1) == p_a!(start-1) 322 then do when (i == start) impossible 323 b1 <- readArray c_a (i-1) 324 b2 <- readArray c_a (start-1) 325 when ((not b1) || b2) impossible 326 writeArray c_a (i-1) False 327 writeArray c_a (start-1) True 328 b <- if start > 2 then readArray c_a (start-2) 329 else return False 330 start' <- if b then liftM (1+) (prevUnchanged c_a (start-2)) 331 else return (start-1) 332 j' <- prevUnchanged c_b (j-1) 333 shiftBackward start' (i-1) j' 334 else do return (start,i,j) 335 shiftForward start i j corr blank = 336 if i <= aLen p_a && p_a!i == p_a!start && 337 -- B.empty at the end of file marks empty line after final newline 338 not ((i == aLen p_a) && (p_a!i == B.empty)) 339 then do when (i == start) impossible 340 b1 <- readArray c_a i 341 b2 <- readArray c_a start 342 when ((not b2) || b1) impossible 343 writeArray c_a i True 344 writeArray c_a start False 345 i0 <- nextUnchanged c_a (i+1) 346 j0 <- nextUnchanged c_b (j+1) 347 let corr0 = if i0 > (i+1) then noline 348 else if j0-j > 2 then i0 else corr 349 let blank0 = if i0 > i+1 then noline 350 else if p_a!(i0-1) == B.empty then i0 351 else blank 352 shiftForward (start+1) i0 j0 corr0 blank0 353 else do return (start,i,j,corr,blank) 354 moveCorr start i j corr = 355 if corr >= i 356 then return (start,i,j) 357 else do b1 <- readArray c_a (i-1) 358 b2 <- readArray c_a (start-1) 359 when ((not b1) || b2) impossible 360 when (p_a!(i-1) /= p_a!(start-1)) impossible 361 writeArray c_a (i-1) False 362 writeArray c_a (start-1) True 363 j' <- prevUnchanged c_b (j-1) 364 moveCorr (start-1) (i-1) j' corr 365 366 -- | goto next unchanged line, return the given line if unchanged 367 nextUnchanged :: BSTArray s -> Int -> ST s Int 368 nextUnchanged c i = do 369 len <- aLenM c 370 if i == len + 1 then return i 371 else do b <- readArray c i 372 if b then nextUnchanged c (i+1) 373 else return i 374 375 -- | skip at least one unchanged line, if there is none advance 376 -- behind the last line 377 skipOneUnChanged :: BSTArray s -> Int -> ST s Int 378 skipOneUnChanged c i = do 379 len <- aLenM c 380 if i == len + 1 381 then return i 382 else do b <- readArray c i 383 if not b then return (i+1) 384 else skipOneUnChanged c (i+1) 385 386 -- | goto n-th next unchanged line 387 nextUnchangedN :: BSTArray s -> Int -> Int -> ST s Int 388 nextUnchangedN c n i = do 389 if n == 0 then return i 390 else do i' <- skipOneUnChanged c i 391 nextUnchangedN c (n-1) i' 392 393 -- | goto next changed line, return the given line if changed 394 nextChanged :: BSTArray s -> Int -> ST s (Maybe Int) 395 nextChanged c i = do 396 len <- aLenM c 397 if i <= len 398 then do b <- readArray c i 399 if not b then nextChanged c (i+1) 400 else return $ Just i 401 else return Nothing 402 403 -- | goto previous unchanged line, return the given line if unchanged 404 prevUnchanged :: BSTArray s -> Int -> ST s Int 405 prevUnchanged c i = do 406 b <- readArray c i 407 if b then prevUnchanged c (i-1) 408 else return i 409 410 type HArray = UArray Int Int32 411 type BArray = UArray Int Bool 412 type PArray = Array Int B.ByteString 413 type MapArray = UArray Int Int 414 type VSTArray s = STUArray s Int Int 415 type BSTArray s = STUArray s Int Bool 416 417 initV :: Int -> ST s (VSTArray s) 418 initV dmax = do 419 newArray (-(dmax + 1), dmax + 1) (-1) 420 421 initVRev :: Int -> Int -> ST s (VSTArray s) 422 initVRev dmax xmax = do 423 newArray (-(dmax + 1), dmax + 1) (xmax + 1) 424 425 -- 1 indexed, v[0] is used as a guard element 426 initVChanged :: Int -> ST s (BSTArray s) 427 initVChanged l = do 428 a <- newArray (0, l) True 429 writeArray a 0 False 430 return a 431 -- set to false for all lines which have a mapping later 432 -- other lines are only present in one of the files 433 434 initH :: [Int32] -> HArray 435 initH a = listArray (0, length a) (0:a) 436 437 initM :: [Int] -> MapArray 438 initM a = listArray (0, length a) (0:a) 439 440 initP :: [B.ByteString] -> PArray 441 initP a = listArray (0, length a) (B.empty:a) 442 443 aLen :: (IArray a e) => a Int e -> Int 444 aLen a = snd $ bounds a 445 aLenM :: (MArray a e m) => a Int e -> m Int 446 aLenM a = getBounds a >>= return . snd 447 448 convertPatch :: Int -> PArray -> PArray -> (Int, Int, Int, Int) 449 -> (Int,[B.ByteString],[B.ByteString]) 450 convertPatch off a b (a0,a1,b0,b1) 451 | b0 == b1 = (b0+off,getDelete a a0 a1,[]) 452 | a0 == a1 = (b0+off,[],getInsert b b0 b1) 453 | otherwise = (b0+off,getDelete a a0 a1,getInsert b b0 b1) 454 455 getInsert :: PArray -> Int -> Int -> [B.ByteString] 456 getInsert b from to 457 | from >= to = [] 458 | otherwise = (b!(from+1)):(getInsert b (from+1) to) 459 getDelete :: PArray -> Int -> Int -> [B.ByteString] 460 getDelete a from to 461 | from >= to = [] 462 | otherwise = (a!(from+1)):(getDelete a (from+1) to) 463 464 createPatch :: BArray -> BArray -> [(Int, Int, Int, Int)] 465 createPatch c_a c_b = 466 reverse $ createP c_a c_b (aLen c_a) (aLen c_b) 467 468 createP :: BArray -> BArray -> Int -> Int -> [(Int, Int, Int, Int)] 469 createP _ _ 0 0 = [] 470 createP c_a c_b ia ib = 471 if c_a!ia || c_b!ib 472 then let ia' = skipChangedRev c_a ia 473 ib' = skipChangedRev c_b ib 474 in (ia',ia,ib',ib):(createP c_a c_b ia' ib') 475 else createP c_a c_b (ia-1) (ib-1) 476 477 skipChangedRev :: BArray -> Int -> Int 478 skipChangedRev c i = if i >= 0 && c!i then skipChangedRev c (i-1) else i