A Haskell program that uses the Conduit library to retrieve and store image files with Network.HTTP.Conduit, which are accessible via an URL with a date in it. The images are stored in an images
subdirectory with the date in the filename. The program skips URLs that do not return a HTTP status code of 200
(e.g. weekends, holidays, or other periods without images available) and generates an exception if the status code is not in the 2xx
range. The exception is rethrown when the status code is not 404
. This all has no practical use at all but I wanted to see how exceptions are handled in conduits.
src/ImageDownloader.hs
Note the sinkList
and sourceList
, which are a nice Conduit
feature and the use of catchC
.
{-# LANGUAGE ScopedTypeVariables #-}
module ImageDownloader where
import Control.Exception (throwIO)
import Control.Monad (unless, void, when)
import Control.Monad.IO.Class (liftIO)
import Data.Conduit (catchC, runConduitRes, (.|))
import qualified Data.Conduit.Binary as CB
import qualified Data.Conduit.Combinators as CC
import qualified Data.Conduit.List as CL
import Data.List.Split (chunksOf)
import Data.Time
import Network.HTTP.Client (BodyReader, HttpException,
HttpExceptionContent (StatusCodeException),
checkResponse, responseBody,
responseStatus)import Network.HTTP.Simple
import Network.HTTP.Types (statusCode)
import System.Directory (doesFileExist)
outputDir :: String
= "images/"
outputDir
statusValid :: Int
= 200
statusValid
previousDay :: Day -> Day
= addDays (-1)
previousDay
show' :: Int -> String
show' n| length (show n) == 1 = "0" ++ show n
| otherwise = show n
previousDate :: String -> String -> String
=
previousDate dat yyNow let
= chunksOf 2 dat
[yy,mm,dd] year :: Integer) = read yy
(month :: Int) = read mm
(day :: Int) = read dd
(thisYear :: Integer) = read yyNow
(= if year > thisYear then 1900 + year else 2000 + year
year' = fromGregorian year' month day
days = previousDay days
pDays = toGregorian pDays
(pyy,pmm,pdd) = chunksOf 2 $ show pyy
[_,pyy'] in
++ show' pmm ++ show' pdd
pyy'
-- | Throw 'HttpException' exception on server errors (not 2xx).
checkHttpResponse :: Request -> Response BodyReader -> IO ()
=
checkHttpResponse request response let sc = statusCode (responseStatus response)
in
`div` 100 /= 2)
when (sc $ throwIO
$ HttpExceptionRequest
requestStatusCodeException (void response) mempty)
(
processUrl' :: String -> String -> String -> String -> IO ()
= do
processUrl' yyNow url dat ext let pathName = outputDir ++ dat ++ ext
<- doesFileExist pathName
fe $ do
unless fe -- rq <- parseRequest fullUrl
<- parseRequest fullUrl
rq' let rq = rq' { checkResponse = checkHttpResponse }
<- runConduitRes
rs $ (httpSource rq getSrc `catchC`
e :: HttpException) -> do
(\(let HttpExceptionRequest _ (StatusCodeException ersp _) = e
= statusCode (responseStatus ersp)
erspsc $ putStrLn $ " CAUGHT EXCEPTION (HTTP status code=" ++ show erspsc ++ ")"
liftIO /= 404) $
when (erspsc $ throwIO
liftIO $ HttpExceptionRequest
rqStatusCodeException (void ersp) mempty)))
(.| CC.sinkList
null rs) $
unless (
runConduitRes$ CL.sourceList rs
.| CB.sinkFile pathName
processUrl' yyNow url (previousDate dat yyNow) extwhere
= url ++ dat ++ ext
fullUrl = do
getSrc res let sc = getResponseStatusCode res
-- Only when status valid; empty result otherwise.
== statusValid) $ do
when (sc
getResponseBody res$ print (fullUrl, getResponseStatus res, getResponseHeaders res)
liftIO
processUrl :: String -> String -> String -> IO ()
= do
processUrl url dat ext <- getCurrentTime
now let yyNow = formatTime defaultTimeLocale "%y" now
processUrl' yyNow url dat ext
app/Main.hs
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Main where
import qualified Options.Applicative as OA
import ImageDownloader
data Args = Args String String String
args :: OA.Parser Args
= Args
args <$> OA.strArgument
"url" <> OA.help "Input is a base url")
(OA.metavar <*> OA.strArgument
"date" <> OA.help "Input is a date (yymmdd) section --e.g. 211119")
(OA.metavar <*> OA.strArgument
"extension" <> OA.help "Input is a file extension --e.g. .jpg")
(OA.metavar
argsInfo :: OA.ParserInfo Args
= OA.info args OA.fullDesc
argsInfo
main :: IO ()
= do
main Args url dat ext <- OA.execParser argsInfo
processUrl url dat ext
package.yaml
Use the hpack
command to generate a cabal file for the project.
name: imagedownloader
version: 0.0.0.1
synopsis: Image downloader
description: |
Downloads images with date in URL.category: HTML
license: GPL-3
stability: development
ghc-options:
- -Wall
- -fno-warn-unused-do-bind
- -fno-warn-name-shadowing
- -fno-warn-missing-signatures
- -fno-warn-type-defaults
- -fno-warn-orphans
library:
source-dirs: src
dependencies:
- base
- bytestring
- conduit
- conduit-extra
- directory
- http-client
- http-conduit
- http-types
- resourcet
- split
- streaming-commons
- time
executables:
imagedownloader:
main: Main.hs
source-dirs: app
ghc-options: [-threaded]
dependencies:
- base
- optparse-applicative
- imagedownloader
Build and execute
hpack
cabal new-build
and to execute, use e.g.:
cabal new-run . -- http://www.yoursitename.com/images/image 211125 .jpg
Which will retrieve all images from the specified date in yymmdd
format (i.e. 2021, November 25th) backwards in time. Use Ctrl+C to abort the program when there are no more images to download. Watch the output of the program to determine this. The program will not redownload already downloaded images on a subsequent run.