Skip to content

Commit 74b1d0f

Browse files
committed
replaced HTML web scraping examples
1 parent 8320fca commit 74b1d0f

File tree

5 files changed

+48
-70
lines changed

5 files changed

+48
-70
lines changed

WebScraping/HandsomeSoupTest.hs

Lines changed: 0 additions & 31 deletions
This file was deleted.

WebScraping/HttpClientExample.hs

Lines changed: 0 additions & 26 deletions
This file was deleted.

WebScraping/README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,13 @@
33
## Run examples using stack
44

55
````````
6-
stack build --exec HttpClientExample
7-
stack build --exec HandsomeSoupTest
6+
stack build --exec TagSoupTest
87
````````
98

109
## Run examples using cabal:
1110

1211
````````
1312
cabal build
14-
cabal run HandsomeSoupTest
13+
cabal run TagSoupTest
1514
````````
1615

WebScraping/TagSoupTest.hs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{-# LANGUAGE OverloadedStrings #-}
2+
3+
import Network.HTTP.Simple
4+
import Text.HTML.TagSoup
5+
import Data.Text (Text)
6+
import qualified Data.Text as T
7+
import qualified Data.Text.IO as TIO
8+
import qualified Data.ByteString.Lazy.Char8 as BL8
9+
import Data.Maybe (mapMaybe)
10+
11+
-- Download the markwatson.com home page, then print its HTTP response headers, its text content, and its link targets.
main :: IO ()
12+
main = do
13+
-- Fetch the page and parse the body into tags. NOTE(review): BL8.unpack maps each byte to one Char, so non-ASCII UTF-8 text is not decoded; confirm this is acceptable.
14+
response <- httpLBS "https://markwatson.com/"
15+
let body = BL8.unpack $ getResponseBody response
16+
tags = parseTags body
17+
18+
-- Print the HTTP response headers (from getResponseHeaders, not HTML heading elements), one per line
19+
let headers = getResponseHeaders response
20+
putStrLn "Headers:"
21+
mapM_ print headers
22+
23+
-- Gather the page's text with extractTexts and print the resulting Text
24+
let texts = extractTexts tags
25+
putStrLn "\nText Content:"
26+
TIO.putStrLn texts
27+
28+
-- Gather the anchor targets with extractLinks and print one per line
29+
let links = extractLinks tags
30+
putStrLn "\nLinks:"
31+
mapM_ TIO.putStrLn links
32+
33+
-- Collect every text node, strip surrounding whitespace, drop nodes that are empty after stripping (so whitespace-only nodes between tags add no extra spaces), and join the rest with spaces.
34+
extractTexts :: [Tag String] -> Text
35+
extractTexts = T.unwords . filter (not . T.null) . map (T.strip . T.pack) . mapMaybe maybeTagText
36+
37+
-- Collect the href value of every opening <a> tag. fromAttrib yields "" when the attribute is absent, so empty results are dropped rather than emitted as blank links.
38+
extractLinks :: [Tag String] -> [Text]
39+
extractLinks = map T.pack . filter (not . null) . map (fromAttrib "href") . filter isATag
40+
  where
41+
    isATag (TagOpen "a" _) = True
42+
    isATag _ = False

WebScraping/WebScraping.cabal

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,10 @@ category: dev
1212
build-type: Simple
1313
cabal-version: >=1.22.4.0
1414

15-
executable HttpClientExample
15+
executable TagSoupTest
1616
hs-source-dirs: .
17-
main-is: HttpClientExample.hs
17+
main-is: TagSoupTest.hs
1818
default-language: Haskell2010
1919
build-depends: base >= 4.7 && < 5, wreq, lens, bytestring,
20-
hxt, json >= 0.10
21-
22-
executable HandsomeSoupTest
23-
hs-source-dirs: .
24-
main-is: HandsomeSoupTest.hs
25-
default-language: Haskell2010
26-
build-depends: base >= 4.7 && < 5, wreq, lens, bytestring,
27-
HandsomeSoup, hxt, json >= 0.10
20+
json >= 0.10, text >=2.1,
21+
http-conduit >= 2.3.8.3, tagsoup >= 0.14.8

0 commit comments

Comments
 (0)