1 {-# LANGUAGE BangPatterns, ForeignFunctionInterface, CPP, ScopedTypeVariables #-} 2 3 ----------------------------------------------------------------------------- 4 -- | 5 -- Module : ByteStringUtils 6 -- Copyright : (c) The University of Glasgow 2001, 7 -- David Roundy 2003-2005 8 -- License : GPL (I'm happy to also license this file BSD style but don't 9 -- want to bother distributing two license files with darcs. 10 -- 11 -- Maintainer : droundy@abridgegame.org 12 -- Stability : experimental 13 -- Portability : portable 14 -- 15 -- GZIp and MMap IO for ByteStrings, and miscellaneous functions for Data.ByteString 16 -- 17 18 module ByteStringUtils ( 19 20 unsafeWithInternals, 21 unpackPSfromUTF8, 22 23 -- IO with mmap or gzip 24 gzReadFilePS, 25 mmapFilePS, 26 gzWriteFilePS, 27 gzWriteFilePSs, 28 29 -- gzip handling 30 isGZFile, 31 gzDecompress, 32 33 -- list utilities 34 ifHeadThenTail, 35 dropSpace, 36 breakSpace, 37 linesPS, 38 unlinesPS, 39 hashPS, 40 breakFirstPS, 41 breakLastPS, 42 substrPS, 43 readIntPS, 44 is_funky, 45 fromHex2PS, 46 fromPS2Hex, 47 betweenLinesPS, 48 break_after_nth_newline, 49 break_before_nth_newline, 50 intercalate 51 ) where 52 53 import Prelude hiding ( catch ) 54 import qualified Data.ByteString as B 55 import qualified Data.ByteString.Char8 as BC 56 import qualified Data.ByteString.Internal as BI 57 import Data.ByteString (intercalate, uncons) 58 import Data.ByteString.Internal (fromForeignPtr) 59 60 #if defined (HAVE_MMAP) 61 import Control.Exception ( catch ) 62 #endif 63 import System.IO 64 import System.IO.Unsafe ( unsafePerformIO ) 65 66 import Foreign.Storable ( peekElemOff, peek ) 67 import Foreign.Marshal.Alloc ( free ) 68 import Foreign.Marshal.Array ( mallocArray, peekArray, advancePtr ) 69 import Foreign.C.Types ( CInt ) 70 71 import Data.Bits ( rotateL ) 72 import Data.Char ( chr, ord, isSpace ) 73 import Data.Word ( Word8 ) 74 import Data.Int ( Int32 ) 75 import Control.Monad ( when ) 76 77 import Foreign.Ptr ( plusPtr, Ptr ) 78 import Foreign.ForeignPtr ( withForeignPtr ) 79 80 #ifdef DEBUG_PS 81 import Foreign.ForeignPtr ( addForeignPtrFinalizer ) 82 import Foreign.Ptr ( FunPtr ) 83 #endif 84 85 import qualified Data.ByteString.Lazy as BL 86 import qualified Codec.Compression.GZip as GZ 87 import qualified Codec.Compression.Zlib.Internal as ZI 88 import Darcs.Global ( addCRCWarning ) 89 90 #ifdef HAVE_MMAP 91 import System.IO.MMap( mmapFileByteString ) 92 import System.Mem( performGC ) 93 import System.Posix.Files( fileSize, getSymbolicLinkStatus ) 94 #endif 95 96 -- ----------------------------------------------------------------------------- 97 -- obsolete debugging code 98 99 -- ----------------------------------------------------------------------------- 100 -- unsafeWithInternals 101 102 -- | Do something with the internals of a PackedString. Beware of 103 -- altering the contents! 104 unsafeWithInternals :: B.ByteString -> (Ptr Word8 -> Int -> IO a) -> IO a 105 unsafeWithInternals ps f 106 = case BI.toForeignPtr ps of 107 (fp,s,l) -> withForeignPtr fp $ \p -> f (p `plusPtr` s) l 108 109 -- | readIntPS skips any whitespace at the beginning of its argument, and 110 -- reads an Int from the beginning of the PackedString. If there is no 111 -- integer at the beginning of the string, it returns Nothing, otherwise it 112 -- just returns the int read, along with a B.ByteString containing the 113 -- remainder of its input. 114 115 readIntPS :: B.ByteString -> Maybe (Int, B.ByteString) 116 readIntPS = BC.readInt . BC.dropWhile isSpace 117 118 -- ----------------------------------------------------------------------------- 119 -- Destructor functions (taking PackedStrings apart) 120 121 unpackPSfromUTF8 :: B.ByteString -> String 122 unpackPSfromUTF8 ps = 123 case BI.toForeignPtr ps of 124 (_,_, 0) -> "" 125 (x,s,l) -> 126 unsafePerformIO $ withForeignPtr x $ \p-> 127 do outbuf <- mallocArray l 128 lout <- fromIntegral `fmap` 129 utf8_to_ints outbuf (p `plusPtr` s) (fromIntegral l) 130 when (lout < 0) $ error "Bad UTF8!" 131 str <- (map (chr . fromIntegral)) `fmap` peekArray lout outbuf 132 free outbuf 133 return str 134 135 foreign import ccall unsafe "static fpstring.h utf8_to_ints" utf8_to_ints 136 :: Ptr Int -> Ptr Word8 -> CInt -> IO CInt 137 138 -- ----------------------------------------------------------------------------- 139 -- List-mimicking functions for PackedStrings 140 141 {-# INLINE ifHeadThenTail #-} 142 ifHeadThenTail :: Word8 -> B.ByteString -> Maybe B.ByteString 143 ifHeadThenTail c s = case uncons s of 144 Just (w, t) | w == c -> Just t 145 _ -> Nothing 146 147 ------------------------------------------------------------------------ 148 -- A reimplementation of Data.ByteString.Char8.dropSpace, but 149 -- specialised to darcs' need for a 4 way isspace. 150 -- 151 -- TODO: if it is safe to use the expanded definition of isSpaceWord8 152 -- provided by Data.ByteString.Char8, then all this can go. 153 154 -- A locale-independent isspace(3) so patches are interpreted the same everywhere. 155 -- ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r') 156 isSpaceWord8 :: Word8 -> Bool 157 isSpaceWord8 w = 158 w == 0x20 || -- ' ' 159 w == 0x09 || -- '\t' 160 w == 0x0A || -- '\n' 161 w == 0x0D -- '\r' 162 {-# INLINE isSpaceWord8 #-} 163 164 firstnonspace :: Ptr Word8 -> Int -> Int -> IO Int 165 firstnonspace !ptr !n !m 166 | n >= m = return n 167 | otherwise = do w <- peekElemOff ptr n 168 if isSpaceWord8 w then firstnonspace ptr (n+1) m else return n 169 170 firstspace :: Ptr Word8 -> Int -> Int -> IO Int 171 firstspace !ptr !n !m 172 | n >= m = return n 173 | otherwise = do w <- peekElemOff ptr n 174 if (not . isSpaceWord8) w then firstspace ptr (n+1) m else return n 175 176 -- | 'dropSpace' efficiently returns the 'ByteString' argument with 177 -- white space Chars removed from the front. It is more efficient than 178 -- calling dropWhile for removing whitespace. I.e. 179 -- 180 -- > dropWhile isSpace == dropSpace 181 -- 182 dropSpace :: B.ByteString -> B.ByteString 183 dropSpace (BI.PS x s l) = BI.inlinePerformIO $ withForeignPtr x $ \p -> do 184 i <- firstnonspace (p `plusPtr` s) 0 l 185 return $! if i == l then B.empty else BI.PS x (s+i) (l-i) 186 {-# INLINE dropSpace #-} 187 188 -- | 'breakSpace' returns the pair of ByteStrings when the argument is 189 -- broken at the first whitespace byte. I.e. 190 -- 191 -- > break isSpace == breakSpace 192 -- 193 breakSpace :: B.ByteString -> (B.ByteString,B.ByteString) 194 breakSpace (BI.PS x s l) = BI.inlinePerformIO $ withForeignPtr x $ \p -> do 195 i <- firstspace (p `plusPtr` s) 0 l 196 return $! case () of {_ 197 | i == 0 -> (B.empty, BI.PS x s l) 198 | i == l -> (BI.PS x s l, B.empty) 199 | otherwise -> (BI.PS x s i, BI.PS x (s+i) (l-i)) 200 } 201 {-# INLINE breakSpace #-} 202 203 ------------------------------------------------------------------------ 204 205 {-# INLINE is_funky #-} 206 is_funky :: B.ByteString -> Bool 207 is_funky ps = case BI.toForeignPtr ps of 208 (x,s,l) -> 209 unsafePerformIO $ withForeignPtr x $ \p-> 210 (/=0) `fmap` has_funky_char (p `plusPtr` s) (fromIntegral l) 211 212 foreign import ccall unsafe "fpstring.h has_funky_char" has_funky_char 213 :: Ptr Word8 -> CInt -> IO CInt 214 215 ------------------------------------------------------------------------ 216 217 -- ByteString rewrites break (=='x') to breakByte 'x' 218 -- break ((==) x) = breakChar x 219 -- break (==x) = breakChar x 220 -- 221 222 {- 223 {-# INLINE breakOnPS #-} 224 breakOnPS :: Char -> B.ByteString -> (B.ByteString, B.ByteString) 225 breakOnPS c p = case BC.elemIndex c p of 226 Nothing -> (p, BC.empty) 227 Just n -> (B.take n p, B.drop n p) 228 -} 229 230 {-# INLINE hashPS #-} 231 hashPS :: B.ByteString -> Int32 232 hashPS ps = 233 case BI.toForeignPtr ps of 234 (x,s,l) -> 235 unsafePerformIO $ withForeignPtr x $ \p-> 236 do hash (p `plusPtr` s) l 237 238 hash :: Ptr Word8 -> Int -> IO Int32 239 hash ptr len = f (0 :: Int32) ptr len 240 where f h _ 0 = return h 241 f h p n = do x <- peek p 242 let !h' = (fromIntegral x) + (rotateL h 8) 243 f h' (p `advancePtr` 1) (n-1) 244 245 {-# INLINE substrPS #-} 246 substrPS :: B.ByteString -> B.ByteString -> Maybe Int 247 substrPS tok str 248 | B.null tok = Just 0 249 | B.length tok > B.length str = Nothing 250 | otherwise = do n <- BC.elemIndex (BC.head tok) str 251 let ttok = B.tail tok 252 reststr = B.drop (n+1) str 253 if ttok == B.take (B.length ttok) reststr 254 then Just n 255 else ((n+1)+) `fmap` substrPS tok reststr 256 257 ------------------------------------------------------------------------ 258 259 -- TODO: replace breakFirstPS and breakLastPS with definitions based on 260 -- ByteString's break/breakEnd 261 {-# INLINE breakFirstPS #-} 262 breakFirstPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString) 263 breakFirstPS c p = case BC.elemIndex c p of 264 Nothing -> Nothing 265 Just n -> Just (B.take n p, B.drop (n+1) p) 266 267 {-# INLINE breakLastPS #-} 268 breakLastPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString) 269 breakLastPS c p = case BC.elemIndexEnd c p of 270 Nothing -> Nothing 271 Just n -> Just (B.take n p, B.drop (n+1) p) 272 273 -- TODO: rename 274 {-# INLINE linesPS #-} 275 linesPS :: B.ByteString -> [B.ByteString] 276 linesPS ps 277 | B.null ps = [B.empty] 278 | otherwise = BC.split '\n' ps 279 280 {- QuickCheck property: 281 282 import Test.QuickCheck 283 import qualified Data.ByteString.Char8 as BC 284 import Data.Char 285 instance Arbitrary BC.ByteString where 286 arbitrary = fmap BC.pack arbitrary 287 instance Arbitrary Char where 288 arbitrary = chr `fmap` choose (32,127) 289 deepCheck = check (defaultConfig { configMaxTest = 10000}) 290 testLines = deepCheck (\x -> (linesPS x == linesPSOld x)) 291 linesPSOld ps = case BC.elemIndex '\n' ps of 292 Nothing -> [ps] 293 Just n -> B.take n ps : linesPS (B.drop (n+1) ps) -} 294 295 {-| This function acts exactly like the "Prelude" unlines function, or like 296 "Data.ByteString.Char8" 'unlines', but with one important difference: it will 297 produce a string which may not end with a newline! That is: 298 299 > unlinesPS ["foo", "bar"] 300 301 evaluates to \"foo\\nbar\", not \"foo\\nbar\\n\"! This point should hold true for 302 'linesPS' as well. 303 304 TODO: rename this function. -} 305 unlinesPS :: [B.ByteString] -> B.ByteString 306 unlinesPS [] = BC.empty 307 unlinesPS x = BC.init $ BC.unlines x 308 {-# INLINE unlinesPS #-} 309 {- QuickCheck property: 310 311 testUnlines = deepCheck (\x -> (unlinesPS x == unlinesPSOld x)) 312 unlinesPSOld ss = BC.concat $ intersperse_newlines ss 313 where intersperse_newlines (a:b:s) = a : newline : intersperse_newlines (b:s) 314 intersperse_newlines s = s 315 newline = BC.pack "\n" -} 316 317 -- ----------------------------------------------------------------------------- 318 -- gzReadFilePS 319 320 -- |Decompress the given bytestring into a lazy list of chunks, along with a boolean 321 -- flag indicating (if True) that the CRC was corrupted. 322 -- Inspecting the flag will cause the entire list of chunks to be evaluated (but if 323 -- you throw away the list immediately this should run in constant space). 324 gzDecompress :: Maybe Int -> BL.ByteString -> ([B.ByteString], Bool) 325 gzDecompress mbufsize = 326 -- This is what the code would be without the bad CRC recovery logic: 327 -- return . BL.toChunks . GZ.decompressWith decompressParams 328 toListWarn . ZI.decompressWithErrors ZI.GZip decompressParams 329 where 330 decompressParams = case mbufsize of 331 Just bufsize -> GZ.defaultDecompressParams { GZ.decompressBufferSize = bufsize } 332 Nothing -> GZ.defaultDecompressParams 333 334 toListWarn :: ZI.DecompressStream -> ([B.ByteString], Bool) 335 toListWarn = foldDecompressStream (\x ~(xs, b) -> (x:xs, b)) ([], False) handleBad 336 337 -- cut and paste from Zlib since it's not currently exported (interface not yet certain) 338 foldDecompressStream :: (B.ByteString -> a -> a) -> a 339 -> (ZI.DecompressError -> String -> a) 340 -> ZI.DecompressStream -> a 341 foldDecompressStream chunk end err = fold 342 where 343 fold ZI.StreamEnd = end 344 fold (ZI.StreamChunk bs stream) = chunk bs (fold stream) 345 fold (ZI.StreamError code msg) = err code msg 346 347 -- For a while a bug in darcs caused gzip files with good data but bad CRCs to be 348 -- produced. Trap bad CRC messages, run the specified action to report that it happened, 349 -- but continue on the assumption that the data is valid. 350 handleBad ZI.DataError "incorrect data check" = ([], True) 351 handleBad _ msg = error msg 352 353 isGZFile :: FilePath -> IO (Maybe Int) 354 isGZFile f = do 355 h <- openBinaryFile f ReadMode 356 header <- B.hGet h 2 357 if header /= BC.pack "\31\139" 358 then do hClose h 359 return Nothing 360 else do hSeek h SeekFromEnd (-4) 361 len <- hGetLittleEndInt h 362 hClose h 363 return (Just len) 364 365 -- | Read an entire file, which may or may not be gzip compressed, directly 366 -- into a 'B.ByteString'. 367 gzReadFilePS :: FilePath -> IO B.ByteString 368 gzReadFilePS f = do 369 mlen <- isGZFile f 370 case mlen of 371 Nothing -> mmapFilePS f 372 Just len -> 373 do -- Passing the length to gzDecompress means that it produces produces one chunk, 374 -- which in turn means that B.concat won't need to copy data. 375 -- If the length is wrong this will just affect efficiency, not correctness 376 let doDecompress buf = let (res, bad) = gzDecompress (Just len) buf 377 in do when bad $ addCRCWarning f 378 return res 379 compressed <- (BL.fromChunks . return) `fmap` mmapFilePS f 380 B.concat `fmap` doDecompress compressed 381 382 hGetLittleEndInt :: Handle -> IO Int 383 hGetLittleEndInt h = do 384 b1 <- ord `fmap` hGetChar h 385 b2 <- ord `fmap` hGetChar h 386 b3 <- ord `fmap` hGetChar h 387 b4 <- ord `fmap` hGetChar h 388 return $ b1 + 256*b2 + 65536*b3 + 16777216*b4 389 390 gzWriteFilePS :: FilePath -> B.ByteString -> IO () 391 gzWriteFilePS f ps = gzWriteFilePSs f [ps] 392 393 gzWriteFilePSs :: FilePath -> [B.ByteString] -> IO () 394 gzWriteFilePSs f pss = 395 BL.writeFile f $ GZ.compress $ BL.fromChunks pss 396 397 -- ----------------------------------------------------------------------------- 398 -- mmapFilePS 399 400 -- | Like readFilePS, this reads an entire file directly into a 401 -- 'B.ByteString', but it is even more efficient. It involves directly 402 -- mapping the file to memory. This has the advantage that the contents of 403 -- the file never need to be copied. Also, under memory pressure the page 404 -- may simply be discarded, wile in the case of readFilePS it would need to 405 -- be written to swap. If you read many small files, mmapFilePS will be 406 -- less memory-efficient than readFilePS, since each mmapFilePS takes up a 407 -- separate page of memory. Also, you can run into bus errors if the file 408 -- is modified. NOTE: as with 'readFilePS', the string representation in 409 -- the file is assumed to be ISO-8859-1. 410 411 mmapFilePS :: FilePath -> IO B.ByteString 412 #ifdef HAVE_MMAP 413 mmapFilePS f = do 414 x <- mmapFileByteString f Nothing 415 `catch` (\_ -> do 416 size <- fileSize `fmap` getSymbolicLinkStatus f 417 if size == 0 418 then return B.empty 419 else performGC >> mmapFileByteString f Nothing) 420 return x 421 #else 422 mmapFilePS = B.readFile 423 #endif 424 425 -- ------------------------------------------------------------------------- 426 -- fromPS2Hex 427 428 foreign import ccall unsafe "static fpstring.h conv_to_hex" conv_to_hex 429 :: Ptr Word8 -> Ptr Word8 -> CInt -> IO () 430 431 fromPS2Hex :: B.ByteString -> B.ByteString 432 fromPS2Hex ps = case BI.toForeignPtr ps of 433 (x,s,l) -> 434 BI.unsafeCreate (2*l) $ \p -> withForeignPtr x $ \f -> 435 conv_to_hex p (f `plusPtr` s) $ fromIntegral l 436 437 -- ------------------------------------------------------------------------- 438 -- fromHex2PS 439 440 foreign import ccall unsafe "static fpstring.h conv_from_hex" conv_from_hex 441 :: Ptr Word8 -> Ptr Word8 -> CInt -> IO () 442 443 fromHex2PS :: B.ByteString -> B.ByteString 444 fromHex2PS ps = case BI.toForeignPtr ps of 445 (x,s,l) -> 446 BI.unsafeCreate (l `div` 2) $ \p -> withForeignPtr x $ \f -> 447 conv_from_hex p (f `plusPtr` s) (fromIntegral $ l `div` 2) 448 449 -- ------------------------------------------------------------------------- 450 -- betweenLinesPS 451 452 -- | betweenLinesPS returns the B.ByteString between the two lines given, 453 -- or Nothing if they do not appear. 454 455 betweenLinesPS :: B.ByteString -> B.ByteString -> B.ByteString 456 -> Maybe (B.ByteString) 457 betweenLinesPS start end ps 458 = case break (start ==) (linesPS ps) of 459 (_, _:rest@(bs1:_)) -> 460 case BI.toForeignPtr bs1 of 461 (ps1,s1,_) -> 462 case break (end ==) rest of 463 (_, bs2:_) -> case BI.toForeignPtr bs2 of (_,s2,_) -> Just $ fromForeignPtr ps1 s1 (s2 - s1) 464 _ -> Nothing 465 _ -> Nothing 466 467 -- ------------------------------------------------------------------------- 468 -- break_after_nth_newline 469 470 break_after_nth_newline :: Int -> B.ByteString 471 -> Maybe (B.ByteString, B.ByteString) 472 break_after_nth_newline 0 the_ps | B.null the_ps = Just (B.empty, B.empty) 473 break_after_nth_newline n the_ps = 474 case BI.toForeignPtr the_ps of 475 (fp,the_s,l) -> 476 unsafePerformIO $ withForeignPtr fp $ \p -> 477 do let findit 0 s | s == end = return $ Just (the_ps, B.empty) 478 findit _ s | s == end = return Nothing 479 findit 0 s = let left_l = s - the_s 480 in return $ Just (fromForeignPtr fp the_s left_l, 481 fromForeignPtr fp s (l - left_l)) 482 findit i s = do w <- peekElemOff p s 483 if w == nl then findit (i-1) (s+1) 484 else findit i (s+1) 485 nl = BI.c2w '\n' 486 end = the_s + l 487 findit n the_s 488 489 -- ------------------------------------------------------------------------- 490 -- break_before_nth_newline 491 492 break_before_nth_newline :: Int -> B.ByteString -> (B.ByteString, B.ByteString) 493 break_before_nth_newline 0 the_ps 494 | B.null the_ps = (B.empty, B.empty) 495 break_before_nth_newline n the_ps = 496 case BI.toForeignPtr the_ps of 497 (fp,the_s,l) -> 498 unsafePerformIO $ withForeignPtr fp $ \p -> 499 do let findit _ s | s == end = return (the_ps, B.empty) 500 findit i s = do w <- peekElemOff p s 501 if w == nl 502 then if i == 0 503 then let left_l = s - the_s 504 in return (fromForeignPtr fp the_s left_l, 505 fromForeignPtr fp s (l - left_l)) 506 else findit (i-1) (s+1) 507 else findit i (s+1) 508 nl = BI.c2w '\n' 509 end = the_s + l 510 findit n the_s