1 {-# LANGUAGE BangPatterns, ForeignFunctionInterface, CPP, ScopedTypeVariables #-}
    2 
    3 -----------------------------------------------------------------------------
    4 -- |
    5 -- Module      :  ByteStringUtils
    6 -- Copyright   :  (c) The University of Glasgow 2001,
    7 --                    David Roundy 2003-2005
    8 -- License : GPL (I'm happy to also license this file BSD style but don't
    9 --           want to bother distributing two license files with darcs.
   10 --
   11 -- Maintainer  :  droundy@abridgegame.org
   12 -- Stability   :  experimental
   13 -- Portability :  portable
   14 --
   15 -- GZIp and MMap IO for ByteStrings, and miscellaneous functions for Data.ByteString
   16 --
   17 
   18 module ByteStringUtils (
   19 
   20         unsafeWithInternals,
   21         unpackPSfromUTF8,
   22 
   23         -- IO with mmap or gzip
   24         gzReadFilePS,
   25         mmapFilePS,
   26         gzWriteFilePS,
   27         gzWriteFilePSs,
   28 
   29         -- gzip handling
   30         isGZFile,
   31         gzDecompress,
   32 
   33         -- list utilities
   34         ifHeadThenTail,
   35         dropSpace,
   36         breakSpace,
   37         linesPS,
   38         unlinesPS,
   39         hashPS,
   40         breakFirstPS,
   41         breakLastPS,
   42         substrPS,
   43         readIntPS,
   44         is_funky,
   45         fromHex2PS,
   46         fromPS2Hex,
   47         betweenLinesPS,
   48         break_after_nth_newline,
   49         break_before_nth_newline,
   50         intercalate
   51     ) where
   52 
   53 import Prelude hiding ( catch )
   54 import qualified Data.ByteString            as B
   55 import qualified Data.ByteString.Char8      as BC
   56 import qualified Data.ByteString.Internal   as BI
   57 import Data.ByteString (intercalate, uncons)
   58 import Data.ByteString.Internal (fromForeignPtr)
   59 
   60 #if defined (HAVE_MMAP)
   61 import Control.Exception        ( catch )
   62 #endif
   63 import System.IO
   64 import System.IO.Unsafe         ( unsafePerformIO )
   65 
   66 import Foreign.Storable         ( peekElemOff, peek )
   67 import Foreign.Marshal.Alloc    ( free )
   68 import Foreign.Marshal.Array    ( mallocArray, peekArray, advancePtr )
   69 import Foreign.C.Types          ( CInt )
   70 
   71 import Data.Bits                ( rotateL )
   72 import Data.Char                ( chr, ord, isSpace )
   73 import Data.Word                ( Word8 )
   74 import Data.Int                 ( Int32 )
   75 import Control.Monad            ( when )
   76 
   77 import Foreign.Ptr              ( plusPtr, Ptr )
   78 import Foreign.ForeignPtr       ( withForeignPtr )
   79 
   80 #ifdef DEBUG_PS
   81 import Foreign.ForeignPtr       ( addForeignPtrFinalizer )
   82 import Foreign.Ptr              ( FunPtr )
   83 #endif
   84 
   85 import qualified Data.ByteString.Lazy as BL
   86 import qualified Codec.Compression.GZip as GZ
   87 import qualified Codec.Compression.Zlib.Internal as ZI
   88 import Darcs.Global ( addCRCWarning )
   89 
   90 #ifdef HAVE_MMAP
   91 import System.IO.MMap( mmapFileByteString )
   92 import System.Mem( performGC )
   93 import System.Posix.Files( fileSize, getSymbolicLinkStatus )
   94 #endif
   95 
   96 -- -----------------------------------------------------------------------------
   97 -- obsolete debugging code
   98 
   99 -- -----------------------------------------------------------------------------
  100 -- unsafeWithInternals
  101 
  102 -- | Do something with the internals of a PackedString. Beware of
  103 -- altering the contents!
  104 unsafeWithInternals :: B.ByteString -> (Ptr Word8 -> Int -> IO a) -> IO a
  105 unsafeWithInternals ps f
  106  = case BI.toForeignPtr ps of
  107    (fp,s,l) -> withForeignPtr fp $ \p -> f (p `plusPtr` s) l
  108 
  109 -- | readIntPS skips any whitespace at the beginning of its argument, and
  110 -- reads an Int from the beginning of the PackedString.  If there is no
  111 -- integer at the beginning of the string, it returns Nothing, otherwise it
  112 -- just returns the int read, along with a B.ByteString containing the
  113 -- remainder of its input.
  114 
  115 readIntPS :: B.ByteString -> Maybe (Int, B.ByteString)
  116 readIntPS = BC.readInt . BC.dropWhile isSpace
  117 
  118 -- -----------------------------------------------------------------------------
  119 -- Destructor functions (taking PackedStrings apart)
  120 
  121 unpackPSfromUTF8 :: B.ByteString -> String
  122 unpackPSfromUTF8 ps =
  123  case BI.toForeignPtr ps of
  124    (_,_, 0) -> ""
  125    (x,s,l)  ->
  126     unsafePerformIO $ withForeignPtr x $ \p->
  127     do outbuf <- mallocArray l
  128        lout <- fromIntegral `fmap`
  129                utf8_to_ints outbuf (p `plusPtr` s) (fromIntegral l)
  130        when (lout < 0) $ error "Bad UTF8!"
  131        str <- (map (chr . fromIntegral)) `fmap` peekArray lout outbuf
  132        free outbuf
  133        return str
  134 
  135 foreign import ccall unsafe "static fpstring.h utf8_to_ints" utf8_to_ints
  136     :: Ptr Int -> Ptr Word8 -> CInt -> IO CInt
  137 
  138 -- -----------------------------------------------------------------------------
  139 -- List-mimicking functions for PackedStrings
  140 
  141 {-# INLINE ifHeadThenTail #-}
  142 ifHeadThenTail :: Word8 -> B.ByteString -> Maybe B.ByteString
  143 ifHeadThenTail c s = case uncons s of
  144     Just (w, t) | w == c    -> Just t
  145     _                       -> Nothing
  146 
  147 ------------------------------------------------------------------------
  148 -- A reimplementation of Data.ByteString.Char8.dropSpace, but
  149 -- specialised to darcs' need for a 4 way isspace.
  150 --
  151 -- TODO: if it is safe to use the expanded definition of isSpaceWord8
  152 -- provided by Data.ByteString.Char8, then all this can go.
  153 
  154 -- A locale-independent isspace(3) so patches are interpreted the same everywhere.
  155 -- ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r')
  156 isSpaceWord8 :: Word8 -> Bool
  157 isSpaceWord8 w =
  158     w == 0x20 ||    -- ' '
  159     w == 0x09 ||    -- '\t'
  160     w == 0x0A ||    -- '\n'
  161     w == 0x0D       -- '\r'
  162 {-# INLINE isSpaceWord8 #-}
  163 
  164 firstnonspace :: Ptr Word8 -> Int -> Int -> IO Int
  165 firstnonspace !ptr !n !m
  166     | n >= m    = return n
  167     | otherwise = do w <- peekElemOff ptr n
  168                      if isSpaceWord8 w then firstnonspace ptr (n+1) m else return n
  169 
  170 firstspace :: Ptr Word8 -> Int -> Int -> IO Int
  171 firstspace !ptr !n !m
  172     | n >= m    = return n
  173     | otherwise = do w <- peekElemOff ptr n
  174                      if (not . isSpaceWord8) w then firstspace ptr (n+1) m else return n
  175 
  176 -- | 'dropSpace' efficiently returns the 'ByteString' argument with
  177 -- white space Chars removed from the front. It is more efficient than
  178 -- calling dropWhile for removing whitespace. I.e.
  179 -- 
  180 -- > dropWhile isSpace == dropSpace
  181 --
  182 dropSpace :: B.ByteString -> B.ByteString
  183 dropSpace (BI.PS x s l) = BI.inlinePerformIO $ withForeignPtr x $ \p -> do
  184     i <- firstnonspace (p `plusPtr` s) 0 l
  185     return $! if i == l then B.empty else BI.PS x (s+i) (l-i)
  186 {-# INLINE dropSpace #-}
  187 
  188 -- | 'breakSpace' returns the pair of ByteStrings when the argument is
  189 -- broken at the first whitespace byte. I.e.
  190 -- 
  191 -- > break isSpace == breakSpace
  192 --
  193 breakSpace :: B.ByteString -> (B.ByteString,B.ByteString)
  194 breakSpace (BI.PS x s l) = BI.inlinePerformIO $ withForeignPtr x $ \p -> do
  195     i <- firstspace (p `plusPtr` s) 0 l
  196     return $! case () of {_
  197         | i == 0    -> (B.empty, BI.PS x s l)
  198         | i == l    -> (BI.PS x s l, B.empty)
  199         | otherwise -> (BI.PS x s i, BI.PS x (s+i) (l-i))
  200     }
  201 {-# INLINE breakSpace #-}
  202 
  203 ------------------------------------------------------------------------
  204 
  205 {-# INLINE is_funky #-}
  206 is_funky :: B.ByteString -> Bool
  207 is_funky ps = case BI.toForeignPtr ps of
  208    (x,s,l) ->
  209     unsafePerformIO $ withForeignPtr x $ \p->
  210     (/=0) `fmap` has_funky_char (p `plusPtr` s) (fromIntegral l)
  211 
  212 foreign import ccall unsafe "fpstring.h has_funky_char" has_funky_char
  213     :: Ptr Word8 -> CInt -> IO CInt
  214 
  215 ------------------------------------------------------------------------
  216 
  217 -- ByteString rewrites break (=='x') to breakByte 'x'
  218 --  break ((==) x) = breakChar x
  219 --  break (==x) = breakChar x
  220 --
  221 
  222 {-
  223 {-# INLINE breakOnPS #-}
  224 breakOnPS :: Char -> B.ByteString -> (B.ByteString, B.ByteString)
  225 breakOnPS c p = case BC.elemIndex c p of
  226                 Nothing -> (p, BC.empty)
  227                 Just n  -> (B.take n p, B.drop n p)
  228 -}
  229 
  230 {-# INLINE hashPS #-}
  231 hashPS :: B.ByteString -> Int32
  232 hashPS ps =
  233    case BI.toForeignPtr ps of
  234    (x,s,l) ->
  235     unsafePerformIO $ withForeignPtr x $ \p->
  236     do hash (p `plusPtr` s) l
  237 
  238 hash :: Ptr Word8 -> Int -> IO Int32
  239 hash ptr len = f (0 :: Int32) ptr len
  240  where f h _ 0 = return h
  241        f h p n = do x <- peek p
  242                     let !h' =  (fromIntegral x) + (rotateL h 8)
  243                     f h' (p `advancePtr` 1) (n-1)
  244 
  245 {-# INLINE substrPS #-}
  246 substrPS :: B.ByteString -> B.ByteString -> Maybe Int
  247 substrPS tok str
  248     | B.null tok = Just 0
  249     | B.length tok > B.length str = Nothing
  250     | otherwise = do n <- BC.elemIndex (BC.head tok) str
  251                      let ttok = B.tail tok
  252                          reststr = B.drop (n+1) str
  253                      if ttok == B.take (B.length ttok) reststr
  254                         then Just n
  255                         else ((n+1)+) `fmap` substrPS tok reststr
  256 
  257 ------------------------------------------------------------------------
  258 
  259 -- TODO: replace breakFirstPS and breakLastPS with definitions based on
  260 -- ByteString's break/breakEnd
  261 {-# INLINE breakFirstPS #-}
  262 breakFirstPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString)
  263 breakFirstPS c p = case BC.elemIndex c p of
  264                    Nothing -> Nothing
  265                    Just n -> Just (B.take n p, B.drop (n+1) p)
  266 
  267 {-# INLINE breakLastPS #-}
  268 breakLastPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString)
  269 breakLastPS c p = case BC.elemIndexEnd c p of
  270                   Nothing -> Nothing
  271                   Just n -> Just (B.take n p, B.drop (n+1) p)
  272 
  273 -- TODO: rename
  274 {-# INLINE linesPS #-}
  275 linesPS :: B.ByteString -> [B.ByteString]
  276 linesPS ps
  277      | B.null ps = [B.empty]
  278      | otherwise = BC.split '\n' ps
  279 
  280 {- QuickCheck property:
  281 
  282 import Test.QuickCheck
  283 import qualified Data.ByteString.Char8 as BC
  284 import Data.Char
  285 instance Arbitrary BC.ByteString where
  286     arbitrary = fmap BC.pack arbitrary
  287 instance Arbitrary Char where
  288   arbitrary = chr `fmap` choose (32,127)
  289 deepCheck = check (defaultConfig { configMaxTest = 10000})
  290 testLines =  deepCheck (\x -> (linesPS x == linesPSOld x))
  291 linesPSOld ps = case  BC.elemIndex '\n' ps of
  292              Nothing -> [ps]
  293              Just n -> B.take n ps : linesPS (B.drop (n+1) ps) -}
  294 
  295 {-| This function acts exactly like the "Prelude" unlines function, or like
  296 "Data.ByteString.Char8" 'unlines', but with one important difference: it will
  297 produce a string which may not end with a newline! That is:
  298 
  299 > unlinesPS ["foo", "bar"]
  300 
  301 evaluates to \"foo\\nbar\", not \"foo\\nbar\\n\"! This point should hold true for
  302 'linesPS' as well.
  303 
  304 TODO: rename this function. -}
  305 unlinesPS :: [B.ByteString] -> B.ByteString
  306 unlinesPS [] = BC.empty
  307 unlinesPS x  = BC.init $ BC.unlines x
  308 {-# INLINE unlinesPS #-}
  309 {- QuickCheck property:
  310 
  311 testUnlines = deepCheck (\x -> (unlinesPS x == unlinesPSOld x))
  312 unlinesPSOld ss = BC.concat $ intersperse_newlines ss
  313     where intersperse_newlines (a:b:s) = a : newline : intersperse_newlines (b:s)
  314           intersperse_newlines s = s
  315           newline = BC.pack "\n" -}
  316 
  317 -- -----------------------------------------------------------------------------
  318 -- gzReadFilePS
  319 
  320 -- |Decompress the given bytestring into a lazy list of chunks, along with a boolean
  321 -- flag indicating (if True) that the CRC was corrupted.
  322 -- Inspecting the flag will cause the entire list of chunks to be evaluated (but if
  323 -- you throw away the list immediately this should run in constant space).
  324 gzDecompress :: Maybe Int -> BL.ByteString -> ([B.ByteString], Bool)
  325 gzDecompress mbufsize =
  326     -- This is what the code would be without the bad CRC recovery logic:
  327     -- return . BL.toChunks . GZ.decompressWith decompressParams
  328     toListWarn . ZI.decompressWithErrors ZI.GZip decompressParams
  329   where
  330         decompressParams = case mbufsize of
  331                               Just bufsize -> GZ.defaultDecompressParams { GZ.decompressBufferSize = bufsize }
  332                               Nothing -> GZ.defaultDecompressParams
  333 
  334         toListWarn :: ZI.DecompressStream -> ([B.ByteString], Bool)
  335         toListWarn = foldDecompressStream (\x ~(xs, b) -> (x:xs, b)) ([], False) handleBad
  336 
  337         -- cut and paste from Zlib since it's not currently exported (interface not yet certain)
  338         foldDecompressStream :: (B.ByteString -> a -> a) -> a
  339                              -> (ZI.DecompressError -> String -> a)
  340                              -> ZI.DecompressStream -> a
  341         foldDecompressStream chunk end err = fold
  342                    where
  343                        fold ZI.StreamEnd               = end
  344                        fold (ZI.StreamChunk bs stream) = chunk bs (fold stream)
  345                        fold (ZI.StreamError code msg)  = err code msg
  346 
  347         -- For a while a bug in darcs caused gzip files with good data but bad CRCs to be
  348         -- produced. Trap bad CRC messages, run the specified action to report that it happened,
  349         -- but continue on the assumption that the data is valid.
  350         handleBad ZI.DataError "incorrect data check" = ([], True)
  351         handleBad _ msg = error msg
  352 
  353 isGZFile :: FilePath -> IO (Maybe Int)
  354 isGZFile f = do
  355     h <- openBinaryFile f ReadMode
  356     header <- B.hGet h 2
  357     if header /= BC.pack "\31\139"
  358        then do hClose h
  359                return Nothing
  360        else do hSeek h SeekFromEnd (-4)
  361                len <- hGetLittleEndInt h
  362                hClose h
  363                return (Just len)
  364 
  365 -- | Read an entire file, which may or may not be gzip compressed, directly
  366 -- into a 'B.ByteString'.
  367 gzReadFilePS :: FilePath -> IO B.ByteString
  368 gzReadFilePS f = do
  369     mlen <- isGZFile f
  370     case mlen of
  371        Nothing -> mmapFilePS f
  372        Just len ->
  373             do -- Passing the length to gzDecompress means that it produces produces one chunk,
  374                -- which in turn means that B.concat won't need to copy data.
  375                -- If the length is wrong this will just affect efficiency, not correctness
  376                let doDecompress buf = let (res, bad) = gzDecompress (Just len) buf
  377                                       in do when bad $ addCRCWarning f
  378                                             return res
  379                compressed <- (BL.fromChunks . return) `fmap` mmapFilePS f
  380                B.concat `fmap` doDecompress compressed
  381 
  382 hGetLittleEndInt :: Handle -> IO Int
  383 hGetLittleEndInt h = do
  384     b1 <- ord `fmap` hGetChar h
  385     b2 <- ord `fmap` hGetChar h
  386     b3 <- ord `fmap` hGetChar h
  387     b4 <- ord `fmap` hGetChar h
  388     return $ b1 + 256*b2 + 65536*b3 + 16777216*b4
  389 
  390 gzWriteFilePS :: FilePath -> B.ByteString -> IO ()
  391 gzWriteFilePS f ps = gzWriteFilePSs f [ps]
  392 
  393 gzWriteFilePSs :: FilePath -> [B.ByteString] -> IO ()
  394 gzWriteFilePSs f pss  =
  395     BL.writeFile f $ GZ.compress $ BL.fromChunks pss
  396 
  397 -- -----------------------------------------------------------------------------
  398 -- mmapFilePS
  399 
  400 -- | Like readFilePS, this reads an entire file directly into a
  401 -- 'B.ByteString', but it is even more efficient.  It involves directly
  402 -- mapping the file to memory.  This has the advantage that the contents of
  403 -- the file never need to be copied.  Also, under memory pressure the page
  404 -- may simply be discarded, wile in the case of readFilePS it would need to
  405 -- be written to swap.  If you read many small files, mmapFilePS will be
  406 -- less memory-efficient than readFilePS, since each mmapFilePS takes up a
  407 -- separate page of memory.  Also, you can run into bus errors if the file
  408 -- is modified.  NOTE: as with 'readFilePS', the string representation in
  409 -- the file is assumed to be ISO-8859-1.
  410 
  411 mmapFilePS :: FilePath -> IO B.ByteString
  412 #ifdef HAVE_MMAP
  413 mmapFilePS f = do
  414   x <- mmapFileByteString f Nothing
  415    `catch` (\_ -> do
  416                      size <- fileSize `fmap` getSymbolicLinkStatus f
  417                      if size == 0
  418                         then return B.empty
  419                         else performGC >> mmapFileByteString f Nothing)
  420   return x
  421 #else
  422 mmapFilePS = B.readFile
  423 #endif
  424 
  425 -- -------------------------------------------------------------------------
  426 -- fromPS2Hex
  427 
  428 foreign import ccall unsafe "static fpstring.h conv_to_hex" conv_to_hex
  429     :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()
  430 
  431 fromPS2Hex :: B.ByteString -> B.ByteString
  432 fromPS2Hex ps = case BI.toForeignPtr ps of
  433           (x,s,l) ->
  434            BI.unsafeCreate (2*l) $ \p -> withForeignPtr x $ \f ->
  435            conv_to_hex p (f `plusPtr` s) $ fromIntegral l
  436 
  437 -- -------------------------------------------------------------------------
  438 -- fromHex2PS
  439 
  440 foreign import ccall unsafe "static fpstring.h conv_from_hex" conv_from_hex
  441     :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()
  442 
  443 fromHex2PS :: B.ByteString -> B.ByteString
  444 fromHex2PS ps = case BI.toForeignPtr ps of
  445           (x,s,l) ->
  446            BI.unsafeCreate (l `div` 2) $ \p -> withForeignPtr x $ \f ->
  447            conv_from_hex p (f `plusPtr` s) (fromIntegral $ l `div` 2)
  448 
  449 -- -------------------------------------------------------------------------
  450 -- betweenLinesPS
  451 
  452 -- | betweenLinesPS returns the B.ByteString between the two lines given,
  453 -- or Nothing if they do not appear.
  454 
  455 betweenLinesPS :: B.ByteString -> B.ByteString -> B.ByteString
  456                -> Maybe (B.ByteString)
  457 betweenLinesPS start end ps
  458  = case break (start ==) (linesPS ps) of
  459        (_, _:rest@(bs1:_)) ->
  460            case BI.toForeignPtr bs1 of
  461             (ps1,s1,_) ->
  462              case break (end ==) rest of
  463                (_, bs2:_) -> case BI.toForeignPtr bs2 of (_,s2,_) -> Just $ fromForeignPtr ps1 s1 (s2 - s1)
  464                _ -> Nothing
  465        _ -> Nothing
  466 
  467 -- -------------------------------------------------------------------------
  468 -- break_after_nth_newline
  469 
  470 break_after_nth_newline :: Int -> B.ByteString
  471                         -> Maybe (B.ByteString, B.ByteString)
  472 break_after_nth_newline 0 the_ps | B.null the_ps = Just (B.empty, B.empty)
  473 break_after_nth_newline n the_ps =
  474   case BI.toForeignPtr the_ps of
  475   (fp,the_s,l) ->
  476    unsafePerformIO $ withForeignPtr fp $ \p ->
  477    do let findit 0 s | s == end = return $ Just (the_ps, B.empty)
  478           findit _ s | s == end = return Nothing
  479           findit 0 s = let left_l = s - the_s
  480                        in return $ Just (fromForeignPtr fp the_s left_l,
  481                                          fromForeignPtr fp s (l - left_l))
  482           findit i s = do w <- peekElemOff p s
  483                           if w == nl then findit (i-1) (s+1)
  484                                      else findit i (s+1)
  485           nl = BI.c2w '\n'
  486           end = the_s + l
  487       findit n the_s
  488 
  489 -- -------------------------------------------------------------------------
  490 -- break_before_nth_newline
  491 
  492 break_before_nth_newline :: Int -> B.ByteString -> (B.ByteString, B.ByteString)
  493 break_before_nth_newline 0 the_ps
  494  | B.null the_ps = (B.empty, B.empty)
  495 break_before_nth_newline n the_ps =
  496  case BI.toForeignPtr the_ps of
  497  (fp,the_s,l) ->
  498    unsafePerformIO $ withForeignPtr fp $ \p ->
  499    do let findit _ s | s == end = return (the_ps, B.empty)
  500           findit i s = do w <- peekElemOff p s
  501                           if w == nl
  502                             then if i == 0
  503                                  then let left_l = s - the_s
  504                                       in return (fromForeignPtr fp the_s left_l,
  505                                                  fromForeignPtr fp s (l - left_l))
  506                                  else findit (i-1) (s+1)
  507                             else findit i (s+1)
  508           nl = BI.c2w '\n'
  509           end = the_s + l
  510       findit n the_s