{- |
   module: Text.CSV 
   license: BSD
   maintainer: peteg42 at gmail dot com
   stability: provisional 
   portability: ghc 

   Based on RFC 4180, \"Common Format and MIME Type for
   Comma-Separated Values (CSV) Files\",
   <http://www.rfc-editor.org/rfc/rfc4180.txt>. ABNF as follows:

   file = [header CRLF] record *(CRLF record) [CRLF]

   header = name *(COMMA name)

   record = field *(COMMA field)

   name = field

   field = (escaped / non-escaped)

   escaped = DQUOTE *(TEXTDATA / COMMA / CR / LF / 2DQUOTE) DQUOTE

   non-escaped = *TEXTDATA

   COMMA = %x2C

   CR = %x0D ;as per section 6.1 of RFC 2234 [2]

   DQUOTE =  %x22 ;as per section 6.1 of RFC 2234 [2]

   LF = %x0A ;as per section 6.1 of RFC 2234 [2]

   CRLF = CR LF ;as per section 6.1 of RFC 2234 [2]

   TEXTDATA =  %x20-21 / %x23-2B / %x2D-7E

   We generalise slighty: allow all characters, accept any sequence of
   CRs and LFs as a line break. We do not special case the header line.

   In contrast to other CSV parsers, this one is lazy. Parsing
   failures are signalled by a call to FIXME @error@.

   (C)opyright Peter Gammie, peteg42 at gmail.com, commenced Marc 2008.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
   OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
   BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
   USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
   DAMAGE.

   (This licence is intended to be in the spirit of the BSD licence.)

-}

module Import.CSV
    ( CSV
    , Record
    , Field
    , parseCSV
    ) where

import Monad ( liftM, liftM2 )

-- | A CSV file is a series of records. According to the RFC, the
-- records all have to have the same length. As an extension, I
-- allow variable length records.
type CSV = [Record]

-- | A record is a series of fields
type Record = [Field]

-- | A field is a string
type Field = String

-- | Turn a 'String' into a lazy list of 'Record's.
parseCSV :: String -> CSV
parseCSV = snd . (runParser csvP)

-- | Lazy monadic parsing. No backtracking or cleverness required,
-- LL(1) is fine.
newtype Parser a = Parser { runParser :: String -> (String, a) }

-- Bind better be lazy.
instance Monad Parser where
    return a = Parser $ \s -> (s, a)
    Parser f >>= g = Parser $ \s ->
                       let (s',  a) = f s
                           Parser ga = g a
                       in ga s'

pEOF :: Parser Bool
pEOF = Parser $ \s -> (s, null s)

isChar :: (Char -> Bool) -> Parser Bool
isChar f = Parser $ \s ->
  case s of
    []    -> ([], False)
    (c:_) -> (s, f c)

pGetChar :: Parser Char
pGetChar = Parser $ \(c:cs) -> (cs, c)

pChoose :: Parser Bool -> Parser a -> Parser a -> Parser a
pChoose f a b =
  do c <- f
     if c then a else b

pMaybeMunchChar :: (Char -> Bool) -> Parser Bool
pMaybeMunchChar f = Parser $ \s ->
  case s of
    []     -> (s, False)
    (c:cs) -> if f c then (cs, True) else (s, False)

pIfNotEOF :: Parser [a] -> Parser [a]
pIfNotEOF p = pChoose pEOF (return []) p

-- | Use the given parser repeatedly until @f@ says 'False' or
-- | EOF. @f@ gets first go.
pMany :: Parser Bool -> Parser a -> Parser [a]
pMany f p = pIfNotEOF $ pChoose f (liftM2 (:) p (pMany f p)) (return [])

-- | Use the given parser once, and then repeatedly use it until f
-- says 'False' or EOF.
pMany1 :: Parser Bool -> Parser a -> Parser [a]
pMany1 f p = pIfNotEOF $ liftM2 (:) p (pChoose f (pMany1 f p) (return []))

----------------------------------------

csvP :: Parser CSV
csvP = pMany1 munchCRLFP recordP
  where
   munchCRLFP :: Parser Bool
   munchCRLFP = liftM (() `elem`) $ pMany mCRLF1P (return ())
     where
       mCRLF1P = pMaybeMunchChar isEOL

recordP :: Parser Record
recordP = pMany1 isComma (pChoose isDoubleQuote quotedFieldP fieldP)
  where
    isComma = pMaybeMunchChar (== ',')
    isDoubleQuote = pMaybeMunchChar (== '"')

fieldP :: Parser Field
fieldP = pMany isNotCommaOrEOL pGetChar
  where
    isNotCommaOrEOL = isChar (\c -> not (c == ',' || isEOL c))

-- | FIXME if EOF occurs in the middle of the field, we accept it.
quotedFieldP :: Parser Field
quotedFieldP = pMany isNotUnescapedDoubleQuote pGetChar
  where
    isNotUnescapedDoubleQuote =
      pChoose (pMaybeMunchChar (== '"'))
              (isChar (== '"')) -- white lie
              (return True)

isEOL :: Char -> Bool
isEOL = (`elem` ['\xA', '\xD'])
