Execution layer for algorithmic trading
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

154 lines
4.9 KiB

7 years ago
{-# OPTIONS_GHC -Wno-type-defaults #-}
module ATrade.Forums.Smartlab (
NewsItem(..),
IndexItem(..),
getIndex,
getItem
) where
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as T
import Data.Text.Encoding
import qualified Data.List as L
import Data.Time.Calendar
import Data.Time.Clock
import Data.Maybe
import Network.HTTP.Simple
import Safe
import Text.HTML.TagSoup
import Text.Parsec
import Text.Parsec.Text
import Text.StringLike
import Debug.Trace
data NewsItem = NewsItem {
niUrl :: !T.Text,
niHeader :: !T.Text,
niText :: !T.Text,
niAuthor :: !T.Text,
niPubTime :: !UTCTime
} deriving (Show, Eq)
data IndexItem = IndexItem {
iiUrl :: !T.Text,
iiTitle :: !T.Text,
iiPubTime :: !UTCTime
} deriving (Show, Eq)
monthNames :: [T.Text]
monthNames = fmap T.pack ["января", "февраля", "марта", "апреля", "мая", "июня", "июля", "августа", "сентября", "октября", "ноября", "декабря"]
extractBetween :: StringLike str => String -> [Tag str] -> [Tag str]
extractBetween tagName = takeWhile (~/= closeTag) . dropWhile (~/= openTag)
where
openTag = "<" ++ tagName ++ ">"
closeTag = "</" ++ tagName ++ ">"
matchClass :: T.Text -> T.Text -> Tag T.Text -> Bool
matchClass _ className (TagOpen _ attrs) = case L.lookup (T.pack "class") attrs of
Just klass -> className `L.elem` T.words klass
Nothing -> False
matchClass _ _ _ = False
parseTimestamp :: T.Text -> Maybe UTCTime
parseTimestamp text = case parse timestampParser "" text of
Left _ -> Nothing
Right val -> Just val
where
timestampParser :: Parser UTCTime
timestampParser = do
spaces
day <- read <$> many1 digit
spaces
monthName <- T.pack <$> many1 letter
case L.elemIndex monthName monthNames of
Nothing -> fail "Can't parse month"
Just month -> do
spaces
year <- fromIntegral . read <$> many1 digit
_ <- char ','
spaces
hour <- fromIntegral . read <$> many1 digit
_ <- char ':'
minute <- fromIntegral . read <$> many1 digit
return $ UTCTime (fromGregorian year (month + 1) day) (hour * 3600 + minute * 60)
getItem :: IndexItem -> IO (Maybe NewsItem)
getItem indexItem = do
rq <- parseRequest $ T.unpack (iiUrl indexItem)
resp <- httpLBS rq
if getResponseStatusCode resp == 200
then return . parseItem . decodeUtf8 . BL.toStrict . getResponseBody $ resp
else return Nothing
where
parseItem rawHtml = case parseTimestamp timestamp of
Just itemPubtime -> Just NewsItem {
niUrl = iiUrl indexItem,
niHeader = itemHeader,
niText = itemText,
niAuthor = itemAuthor,
niPubTime = itemPubtime
}
Nothing -> Nothing
where
itemHeader = innerText .
extractBetween "span" .
extractBetween "h1" .
dropWhile (not . matchClass (T.pack "div") (T.pack "topic")) $ tags
itemText = innerText .
extractBetween "div" .
dropWhile (not . matchClass (T.pack "div") (T.pack "content")) .
dropWhile (~/= "<div id=content_box>") $ tags
itemAuthor = innerText .
extractBetween "li" .
dropWhile (not . matchClass (T.pack "li") (T.pack "author")) $ tags
timestamp = traceShowId $ innerText .
extractBetween "li" .
dropWhile (not . matchClass (T.pack "li") (T.pack "date")) $ tags
tags = parseTags rawHtml
getIndex :: T.Text -> Int -> IO ([IndexItem], Bool)
getIndex rootUrl pageNumber = do
rq <- parseRequest $ T.unpack $ makeUrl rootUrl pageNumber
resp <- httpLBS rq
return $ if getResponseStatusCode resp == 200
then parseIndex . decodeUtf8 . BL.toStrict . getResponseBody $ resp
else ([], False)
where
parseIndex :: T.Text -> ([IndexItem], Bool)
parseIndex x = (mapMaybe parseIndexEntry $ partitions (matchClass (T.pack "div") (T.pack "topic")) $ parseTags x, hasNextPage $ parseTags x)
parseIndexEntry :: [Tag T.Text] -> Maybe IndexItem
parseIndexEntry divTag = do
a <- headMay . dropWhile (~/= "<a>") $ divTag
let text = innerText . takeWhile (~/= "</a>") . dropWhile (~/= "<a>") $ divTag
case a of
TagOpen _ attr -> do
href <- L.lookup (T.pack "href") attr
ts <- parseTimestamp (innerText $ takeWhile (~/= "</li>") . dropWhile (not . matchClass (T.pack "li") (T.pack "date")) $ divTag)
Just IndexItem { iiUrl = href,
iiTitle = text,
iiPubTime = ts }
_ -> Nothing
makeUrl root pagenumber
| pagenumber == 0 || pagenumber == 1 = root
| otherwise = root `T.append` (T.pack "/page") `T.append` T.pack (show pagenumber)
hasNextPage tags = if pageNumber <= 1
then paginationLinksCount > 0
else paginationLinksCount > 1
where
paginationLinksCount = length . filter (~== "<a>") . extractBetween "p" . dropWhile (~/= "<div id=pagination>") $ tags