Date in URL Image scraper
A Haskell program that uses the Conduit library to retrieve and store image files with Network.HTTP.Conduit, which are accessible via an URL with a date in it. The images are stored in an images
subdirectory with the date in the filename. The program skips URLs that do not return a HTTP status code of 200
(e.g. weekends, holidays, or other periods without images available) and generates an exception if the status code is not in the 2xx
range. The exception is rethrown when the status code is not 404
. This all has no practical use at all but I wanted to see how exceptions are handled in conduits.
src/ImageDownloader.hs
Note the sinkList
and sourceList
, which are a nice Conduit
feature and the use of catchC
.
1{-# LANGUAGE ScopedTypeVariables #-}
2module ImageDownloader where
3
4import Control.Exception (throwIO)
5import Control.Monad (unless, void, when)
6import Control.Monad.IO.Class (liftIO)
7import Data.Conduit (catchC, runConduitRes, (.|))
8import qualified Data.Conduit.Binary as CB
9import qualified Data.Conduit.Combinators as CC
10import qualified Data.Conduit.List as CL
11import Data.List.Split (chunksOf)
12import Data.Time
13import Network.HTTP.Client (BodyReader, HttpException,
14 HttpExceptionContent (StatusCodeException),
15 checkResponse, responseBody,
16 responseStatus)
17import Network.HTTP.Simple
18import Network.HTTP.Types (statusCode)
19import System.Directory (doesFileExist)
20
21outputDir :: String
22outputDir = "images/"
23
24statusValid :: Int
25statusValid = 200
26
27previousDay :: Day -> Day
28previousDay = addDays (-1)
29
30show' :: Int -> String
31show' n
32 | length (show n) == 1 = "0" ++ show n
33 | otherwise = show n
34
35previousDate :: String -> String -> String
36previousDate dat yyNow =
37 let
38 [yy,mm,dd] = chunksOf 2 dat
39 (year :: Integer) = read yy
40 (month :: Int) = read mm
41 (day :: Int) = read dd
42 (thisYear :: Integer) = read yyNow
43 year' = if year > thisYear then 1900 + year else 2000 + year
44 days = fromGregorian year' month day
45 pDays = previousDay days
46 (pyy,pmm,pdd) = toGregorian pDays
47 [_,pyy'] = chunksOf 2 $ show pyy
48 in
49 pyy' ++ show' pmm ++ show' pdd
50
51-- | Throw 'HttpException' exception on server errors (not 2xx).
52checkHttpResponse :: Request -> Response BodyReader -> IO ()
53checkHttpResponse request response =
54 let sc = statusCode (responseStatus response)
55 in
56 when (sc `div` 100 /= 2)
57 $ throwIO
58 $ HttpExceptionRequest
59 request
60 (StatusCodeException (void response) mempty)
61
62processUrl' :: String -> String -> String -> String -> IO ()
63processUrl' yyNow url dat ext = do
64 let pathName = outputDir ++ dat ++ ext
65 fe <- doesFileExist pathName
66 unless fe $ do
67 -- rq <- parseRequest fullUrl
68 rq' <- parseRequest fullUrl
69 let rq = rq' { checkResponse = checkHttpResponse }
70 rs <- runConduitRes
71 $ (httpSource rq getSrc `catchC`
72 (\(e :: HttpException) -> do
73 let HttpExceptionRequest _ (StatusCodeException ersp _) = e
74 erspsc = statusCode (responseStatus ersp)
75 liftIO $ putStrLn $ " CAUGHT EXCEPTION (HTTP status code=" ++ show erspsc ++ ")"
76 when (erspsc /= 404) $
77 liftIO $ throwIO
78 $ HttpExceptionRequest
79 rq
80 (StatusCodeException (void ersp) mempty)))
81 .| CC.sinkList
82 unless (null rs) $
83 runConduitRes
84 $ CL.sourceList rs
85 .| CB.sinkFile pathName
86 processUrl' yyNow url (previousDate dat yyNow) ext
87 where
88 fullUrl = url ++ dat ++ ext
89 getSrc res = do
90 let sc = getResponseStatusCode res
91 -- Only when status valid; empty result otherwise.
92 when (sc == statusValid) $ do
93 getResponseBody res
94 liftIO $ print (fullUrl, getResponseStatus res, getResponseHeaders res)
95
96processUrl :: String -> String -> String -> IO ()
97processUrl url dat ext = do
98 now <- getCurrentTime
99 let yyNow = formatTime defaultTimeLocale "%y" now
100 processUrl' yyNow url dat ext
app/Main.hs
1{-# LANGUAGE OverloadedStrings #-}
2{-# LANGUAGE ScopedTypeVariables #-}
3
4module Main where
5
6import qualified Options.Applicative as OA
7import ImageDownloader
8
9data Args = Args String String String
10
11args :: OA.Parser Args
12args = Args
13 <$> OA.strArgument
14 (OA.metavar "url" <> OA.help "Input is a base url")
15 <*> OA.strArgument
16 (OA.metavar "date" <> OA.help "Input is a date (yymmdd) section --e.g. 211119")
17 <*> OA.strArgument
18 (OA.metavar "extension" <> OA.help "Input is a file extension --e.g. .jpg")
19
20argsInfo :: OA.ParserInfo Args
21argsInfo = OA.info args OA.fullDesc
22
23main :: IO ()
24main = do
25 Args url dat ext <- OA.execParser argsInfo
26 processUrl url dat ext
package.yaml
Use the hpack
command to generate a cabal file for the project.
1name: imagedownloader
2version: 0.0.0.1
3synopsis: Image downloader
4description: |
5 Downloads images with date in URL.
6category: HTML
7license: GPL-3
8stability: development
9
10ghc-options:
11- -Wall
12- -fno-warn-unused-do-bind
13- -fno-warn-name-shadowing
14- -fno-warn-missing-signatures
15- -fno-warn-type-defaults
16- -fno-warn-orphans
17
18library:
19 source-dirs: src
20 dependencies:
21 - base
22 - bytestring
23 - conduit
24 - conduit-extra
25 - directory
26 - http-client
27 - http-conduit
28 - http-types
29 - resourcet
30 - split
31 - streaming-commons
32 - time
33
34executables:
35 imagedownloader:
36 main: Main.hs
37 source-dirs: app
38 ghc-options: [-threaded]
39 dependencies:
40 - base
41 - optparse-applicative
42 - imagedownloader
Build and execute
1hpack
2cabal new-build
and to execute, use e.g.:
1cabal new-run . -- http://www.yoursitename.com/images/image 211125 .jpg
Which will retrieve all images from the specified date in yymmdd
format (i.e. 2021, November 25th) backwards in time. Use Ctrl+C to abort the program when there are no more images to download. Watch the output of the program to determine this. The program will not redownload already downloaded images on a subsequent run.