{-# OPTIONS_GHC -Wno-type-defaults #-} module ATrade.Forums.Smartlab ( NewsItem(..), IndexItem(..), getIndex, getItem ) where import qualified Data.ByteString.Lazy as BL import qualified Data.Text as T import Data.Text.Encoding import qualified Data.List as L import Data.Time.Calendar import Data.Time.Clock import Data.Maybe import Network.HTTP.Simple import Safe import Text.HTML.TagSoup import Text.Parsec import Text.Parsec.Text import Text.StringLike import Debug.Trace data NewsItem = NewsItem { niUrl :: !T.Text, niHeader :: !T.Text, niText :: !T.Text, niAuthor :: !T.Text, niPubTime :: !UTCTime } deriving (Show, Eq) data IndexItem = IndexItem { iiUrl :: !T.Text, iiTitle :: !T.Text, iiPubTime :: !UTCTime } deriving (Show, Eq) monthNames :: [T.Text] monthNames = fmap T.pack ["января", "февраля", "марта", "апреля", "мая", "июня", "июля", "августа", "сентября", "октября", "ноября", "декабря"] extractBetween :: StringLike str => String -> [Tag str] -> [Tag str] extractBetween tagName = takeWhile (~/= closeTag) . dropWhile (~/= openTag) where openTag = "<" ++ tagName ++ ">" closeTag = "" ++ tagName ++ ">" matchClass :: T.Text -> T.Text -> Tag T.Text -> Bool matchClass _ className (TagOpen _ attrs) = case L.lookup (T.pack "class") attrs of Just klass -> className `L.elem` T.words klass Nothing -> False matchClass _ _ _ = False parseTimestamp :: T.Text -> Maybe UTCTime parseTimestamp text = case parse timestampParser "" text of Left _ -> Nothing Right val -> Just val where timestampParser :: Parser UTCTime timestampParser = do spaces day <- read <$> many1 digit spaces monthName <- T.pack <$> many1 letter case L.elemIndex monthName monthNames of Nothing -> fail "Can't parse month" Just month -> do spaces year <- fromIntegral . read <$> many1 digit _ <- char ',' spaces hour <- fromIntegral . read <$> many1 digit _ <- char ':' minute <- fromIntegral . read <$> many1 digit return $ UTCTime (fromGregorian year (month + 1) day) (hour * 3600 + minute * 60) getItem :: IndexItem -> IO (Maybe NewsItem) getItem indexItem = do rq <- parseRequest $ T.unpack (iiUrl indexItem) resp <- httpLBS rq if getResponseStatusCode resp == 200 then return . parseItem . decodeUtf8 . BL.toStrict . getResponseBody $ resp else return Nothing where parseItem rawHtml = case parseTimestamp timestamp of Just itemPubtime -> Just NewsItem { niUrl = iiUrl indexItem, niHeader = itemHeader, niText = itemText, niAuthor = itemAuthor, niPubTime = itemPubtime } Nothing -> Nothing where itemHeader = innerText . extractBetween "span" . extractBetween "h1" . dropWhile (not . matchClass (T.pack "div") (T.pack "topic")) $ tags itemText = innerText . extractBetween "div" . dropWhile (not . matchClass (T.pack "div") (T.pack "content")) . dropWhile (~/= "