Skip to content

Commit 532584a

Browse files
committed
Initial project for hw-json-simple-cursor
1 parent 031dc80 commit 532584a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+65
-2576
lines changed

README.md

Lines changed: 4 additions & 143 deletions
Original file line numberDiff line numberDiff line change
@@ -1,150 +1,11 @@
1-
# hw-json
2-
[![master](https://circleci.com/gh/haskell-works/hw-json/tree/master.svg?style=svg)](https://circleci.com/gh/haskell-works/hw-json/tree/master)
1+
# hw-json-simple-cursor
2+
[![master](https://circleci.com/gh/haskell-works/hw-json-simple-cursor/tree/master.svg?style=svg)](https://circleci.com/gh/haskell-works/hw-json-simple-cursor/tree/master)
33

4-
`hw-json` is a succinct JSON parsing library.
4+
`hw-json-simple-cursor` is a support library for `hw-json`, a succinct JSON parsing library.
55

66
It uses succinct data-structures to allow traversal of large JSON strings with minimal memory overhead.
77

8-
For an example, see [`app/Main.hs`](../master/app/Main.hs)
9-
10-
## Prerequisites
11-
12-
* `cabal` version `2.2` or later
13-
14-
## Memory benchmark
15-
16-
### Parsing large Json files in Scala with Argonaut
17-
18-
```text
19-
S0U EU OU MU CCSU CMD
20-
--------- --------- ----------- -------- -------- ---------------------------------------------------------------
21-
0.0 80,526.3 76,163.6 72,338.6 13,058.6 sbt console
22-
0.0 536,660.4 76,163.6 72,338.6 13,058.6 import java.io._, argonaut._, Argonaut._
23-
0.0 552,389.1 76,163.6 72,338.6 13,058.6 val file = new File("/Users/jky/Downloads/78mbs.json")
24-
0.0 634,066.5 76,163.6 72,338.6 13,058.6 val array = new Array[Byte](file.length.asInstanceOf[Int])
25-
0.0 644,552.3 76,163.6 72,338.6 13,058.6 val is = new FileInputStream("/Users/jky/Downloads/78mbs.json")
26-
0.0 655,038.1 76,163.6 72,338.6 13,058.6 is.read(array)
27-
294,976.0 160,159.7 1,100,365.0 79,310.8 13,748.1 val json = new String(array)
28-
285,182.9 146,392.6 1,956,264.5 82,679.8 14,099.6 val data = Parse.parse(json)
29-
***********
30-
```
31-
32-
### Parsing large Json files in Haskell with Aeson
33-
34-
```haskell
35-
-- CMD -- Mem (MB)
36-
---------------------------------------------------------- -- --------
37-
import Control.DeepSeq -- 94
38-
import Data.Aeson -- 100
39-
import qualified Data.ByteString.Lazy as BSL -- 104
40-
bs <- BSL.readFile "../corpus/bench/hospitalisation.json" -- 105
41-
let !x = deepseq bs bs -- 146
42-
let !y = decode json78m :: Maybe Value -- 669
43-
```
44-
45-
### Parsing large Json files in Haskell with hw-json
46-
47-
```haskell
48-
-- CMD -- Mem (MB)
49-
--------------------------------------------------------------------- -- --------
50-
import Foreign -- 93
51-
import Control.Monad -- 95
52-
import Data.Word -- 96
53-
import HaskellWorks.Data.BalancedParens.Simple -- 97
54-
import HaskellWorks.Data.Bits.BitShown -- 98
55-
import HaskellWorks.Data.FromForeignRegion -- 99
56-
import HaskellWorks.Data.Json.Backend.Standard.Cursor -- 106
57-
import System.IO.MMap -- 109
58-
import qualified Data.ByteString as BS -- 110
59-
import qualified Data.Vector.Storable as DVS -- 111
60-
import qualified HaskellWorks.Data.ByteString as BS -- 112
61-
import qualified HaskellWorks.Data.Json.Backend.Standard.Fast as FAST -- 114
62-
bs <- BS.mmap "../corpus/bench/hospitalisation.json" -- 115
63-
let !cursor = FAST.makeCursor bs -- 203
64-
```
65-
66-
## Examples
67-
68-
### Navigation example
69-
70-
```haskell
71-
import Control.Monad
72-
import Data.String
73-
import Data.Word
74-
import HaskellWorks.Data.BalancedParens.Simple
75-
import HaskellWorks.Data.Bits.BitShow
76-
import HaskellWorks.Data.Bits.BitShown
77-
import HaskellWorks.Data.FromForeignRegion
78-
import HaskellWorks.Data.Json.Backend.Standard.Cursor
79-
import HaskellWorks.Data.Json.Internal.Token.Types
80-
import HaskellWorks.Data.RankSelect.Base.Rank0
81-
import HaskellWorks.Data.RankSelect.Base.Rank1
82-
import HaskellWorks.Data.RankSelect.Base.Select1
83-
import HaskellWorks.Data.RankSelect.CsPoppy
84-
import System.IO.MMap
85-
86-
import qualified Data.ByteString as BS
87-
import qualified Data.Vector.Storable as DVS
88-
import qualified HaskellWorks.Data.Json.Backend.Standard.Cursor as C
89-
import qualified HaskellWorks.Data.Json.Backend.Standard.Fast as FAST
90-
import qualified HaskellWorks.Data.TreeCursor as TC
91-
92-
let fc = TC.firstChild
93-
let ns = TC.nextSibling
94-
let pn = TC.parent
95-
let ss = TC.subtreeSize
96-
let cursor = FAST.makeCursor "[null, {\"field\": 1}]"
97-
cursor
98-
fc cursor
99-
(fc >=> ns) cursor
100-
```
101-
102-
### Querying example
103-
104-
```haskell
105-
import Control.Monad
106-
import Data.Function
107-
import Data.List
108-
import HaskellWorks.Data.Json.Backend.Standard.Load.Cursor
109-
import HaskellWorks.Data.Json.Backend.Standard.Load.Partial
110-
import HaskellWorks.Data.Json.Backend.Standard.Load.Raw
111-
import HaskellWorks.Data.Json.PartialValue
112-
import HaskellWorks.Data.MQuery
113-
import HaskellWorks.Data.MQuery.Micro
114-
import HaskellWorks.Data.MQuery.Row
115-
116-
import qualified Data.DList as DL
117-
118-
!cursor <- loadPartial "../corpus/bench/78mb.json"
119-
!cursor <- loadCursorWithIndex "../corpus/bench/78mb.json"
120-
!cursor <- loadCursor "../corpus/bench/78mb.json"
121-
!cursor <- loadCursorWithCsPoppyIndex "../corpus/bench/78mb.json"
122-
let !json = jsonPartialJsonValueAt cursor
123-
let q = MQuery (DL.singleton json)
124-
125-
putPretty $ q >>= item & limit 10
126-
putPretty $ q >>= item & page 10 1
127-
putPretty $ q >>= item >>= hasKV "founded_year" (JsonPartialNumber 2005) & limit 10
128-
putPretty $ q >>= item >>= entry
129-
putPretty $ q >>= item >>= entry >>= named "name" & limit 10
130-
putPretty $ q >>= (item >=> entry >=> named "acquisition" >=> entry >=> named "price_currency_code")
131-
putPretty $ q >>= (item >=> entry >=> named "acquisition" >=> entry >=> named "price_currency_code") & onList (uniq . sort)
132-
putPretty $ q >>= (item >=> entry >=> named "acquisition" >=> entry >=> named "price_currency_code" >=> asString >=> valueOf "USD") & limit 10
133-
putPretty $ q >>= (item >=> entry >=> named "acquisition" >=> having (entry >=> named "price_currency_code" >=> asString >=> valueOf "USD") >=> entry >=> named "price_amount") & limit 10
134-
putPretty $ q >>= (item >=> entry >=> named "acquisition" >=> having (entry >=> named "price_currency_code" >=> asString >=> valueOf "USD") >=> entry >=> named "price_amount" >=> castAsInteger ) & limit 10
135-
putPretty $ q >>= (item >=> entry >=> named "acquisition" >=> having (entry >=> named "price_currency_code" >=> asString >=> valueOf "USD") >=> entry >=> named "price_amount" >=> castAsInteger ) & aggregate sum
136-
137-
putPretty $ q >>= item & limit 10
138-
putPretty $ q >>= item & page 10 1
139-
putPretty $ q >>= item >>= entry
140-
putPretty $ q >>= item >>= entry >>= named "name" & limit 10
141-
putPretty $ q >>= (item >=> entry >=> named "acquisition" >=> entry >=> named "price_currency_code" >=> asString)
142-
putPretty $ q >>= (item >=> entry >=> named "acquisition" >=> entry >=> named "price_currency_code" >=> asString) & onList (uniq . sort)
143-
putPretty $ q >>= (item >=> entry >=> named "acquisition" >=> entry >=> named "price_currency_code" >=> asString >=> valueOf "USD") & limit 10
144-
putPretty $ q >>= (item >=> entry >=> named "acquisition" >=> having (entry >=> named "price_currency_code" >=> asString >=> valueOf "USD") >=> entry >=> named "price_amount") & limit 10
145-
putPretty $ q >>= (item >=> entry >=> named "acquisition" >=> having (entry >=> named "price_currency_code" >=> asString >=> valueOf "USD") >=> entry >=> named "price_amount" >=> castAsInteger ) & limit 10
146-
putPretty $ q >>= (item >=> entry >=> named "acquisition" >=> having (entry >=> named "price_currency_code" >=> asString >=> valueOf "USD") >=> entry >=> named "price_amount" >=> castAsInteger ) & aggregate sum
147-
```
8+
For more information see [`hw-json`](https://github.com/haskell-works/hw-json).
1489

14910
## References
15011

app/App/Commands.hs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
module App.Commands where
22

3-
import App.Commands.Count
43
import App.Commands.CreateIndex
5-
import App.Commands.Demo
64
import Data.Semigroup ((<>))
75
import Options.Applicative
86

@@ -13,5 +11,3 @@ commandsGeneral :: Parser (IO ())
1311
commandsGeneral = subparser $ mempty
1412
<> commandGroup "Commands:"
1513
<> cmdCreateIndex
16-
<> cmdCount
17-
<> cmdDemo

app/App/Commands/Count.hs

Lines changed: 0 additions & 85 deletions
This file was deleted.

app/App/Commands/CreateIndex.hs

Lines changed: 8 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -8,82 +8,25 @@ module App.Commands.CreateIndex
88
) where
99

1010
import Control.Lens
11-
import Control.Monad
1211
import Data.Generics.Product.Any
1312
import Data.Maybe
1413
import Data.Semigroup ((<>))
1514
import Data.Word
1615
import Foreign
1716
import Options.Applicative hiding (columns)
1817

19-
import qualified App.Commands.Types as Z
20-
import qualified Data.ByteString as BS
21-
import qualified Data.ByteString.Internal as BSI
22-
import qualified Data.ByteString.Lazy as LBS
23-
import qualified HaskellWorks.Data.ByteString as BS
24-
import qualified HaskellWorks.Data.ByteString.Lazy as LBS
25-
import qualified HaskellWorks.Data.Json.Simd.Index.Standard as STSI
26-
import qualified HaskellWorks.Data.Json.Simple.Cursor.SemiIndex as SISI
27-
import qualified HaskellWorks.Data.Json.Standard.Cursor.Internal.Blank as J
28-
import qualified HaskellWorks.Data.Json.Standard.Cursor.Internal.BlankedJson as J
29-
import qualified HaskellWorks.Data.Json.Standard.Cursor.Internal.MakeIndex as J
30-
import qualified HaskellWorks.Data.Json.Standard.Cursor.Internal.ToBalancedParens64 as J
31-
import qualified HaskellWorks.Data.Json.Standard.Cursor.SemiIndex as STSI
32-
import qualified System.Exit as IO
33-
import qualified System.IO as IO
34-
import qualified System.IO.MMap as IO
18+
import qualified App.Commands.Types as Z
19+
import qualified Data.ByteString.Internal as BSI
20+
import qualified Data.ByteString.Lazy as LBS
21+
import qualified HaskellWorks.Data.ByteString.Lazy as LBS
22+
import qualified HaskellWorks.Data.Json.Simple.Cursor.SemiIndex as SISI
23+
import qualified System.IO.MMap as IO
3524

3625
{-# ANN module ("HLint: ignore Reduce duplication" :: String) #-}
3726
{-# ANN module ("HLint: ignore Redundant do" :: String) #-}
3827

39-
runCreateIndexStandard :: Z.CreateIndexOptions -> IO ()
40-
runCreateIndexStandard opts = do
41-
let filePath = opts ^. the @"filePath"
42-
let outputIbFile = opts ^. the @"outputIbFile" & fromMaybe (filePath <> ".ib.idx")
43-
let outputBpFile = opts ^. the @"outputBpFile" & fromMaybe (filePath <> ".bp.idx")
44-
case opts ^. the @"method" of
45-
"original" -> do
46-
(fptr :: ForeignPtr Word8, offset, size) <- IO.mmapFileForeignPtr filePath IO.ReadOnly Nothing
47-
let !bs = BSI.fromForeignPtr (castForeignPtr fptr) offset size
48-
let blankedJson = J.blankJson [bs]
49-
let ibs = LBS.fromChunks (J.blankedJsonToInterestBits blankedJson)
50-
let bps = J.toBalancedParens64 (J.BlankedJson blankedJson)
51-
LBS.writeFile outputIbFile ibs
52-
LBS.writeFile outputBpFile (LBS.toLazyByteString bps)
53-
"alternate" -> do
54-
(fptr :: ForeignPtr Word8, offset, size) <- IO.mmapFileForeignPtr filePath IO.ReadOnly Nothing
55-
let !bs = BSI.fromForeignPtr (castForeignPtr fptr) offset size
56-
let STSI.SemiIndex ib bp = STSI.buildSemiIndex bs
57-
BS.writeFile outputIbFile (BS.toByteString ib)
58-
BS.writeFile outputBpFile (BS.toByteString bp)
59-
"tabular" -> do
60-
lbs <- LBS.readFile filePath
61-
let siChunks = STSI.toIbBpBuilders (STSI.buildFromByteString3 (BS.resegmentPadded 64 (LBS.toChunks lbs)))
62-
IO.withFile outputIbFile IO.WriteMode $ \hIb ->
63-
IO.withFile outputBpFile IO.WriteMode $ \hBp ->
64-
forM_ siChunks $ \(STSI.SiChunk ib bp) -> do
65-
LBS.hPut hIb (LBS.toLazyByteString ib)
66-
LBS.hPut hBp (LBS.toLazyByteString bp)
67-
"simd" -> do
68-
IO.withFile filePath IO.ReadMode $ \hIn -> do
69-
contents <- LBS.resegmentPadded 512 <$> LBS.hGetContents hIn
70-
case STSI.makeStandardJsonIbBps contents of
71-
Right chunks -> do
72-
IO.withFile outputIbFile IO.WriteMode $ \hIb -> do
73-
IO.withFile outputBpFile IO.WriteMode $ \hBp -> do
74-
forM_ chunks $ \(ibBs, bpBs) -> do
75-
BS.hPut hIb ibBs
76-
BS.hPut hBp bpBs
77-
Left msg -> IO.hPutStrLn IO.stderr $ "Unable to create index: " <> show msg
78-
"sum" -> do
79-
lbs <- LBS.resegmentPadded 64 <$> LBS.readFile filePath
80-
IO.putStrLn $ "Sum: " <> show (sum (BS.foldl (\a b -> a + fromIntegral b) (0 :: Word64) <$> LBS.toChunks lbs))
81-
unknown -> do
82-
IO.hPutStrLn IO.stderr $ "Unknown method " <> show unknown
83-
IO.exitFailure
84-
85-
runCreateIndexSimple :: Z.CreateIndexOptions -> IO ()
86-
runCreateIndexSimple opts = do
28+
runCreateIndex :: Z.CreateIndexOptions -> IO ()
29+
runCreateIndex opts = do
8730
let filePath = opts ^. the @"filePath"
8831
let outputIbFile = opts ^. the @"outputIbFile" & fromMaybe (filePath <> ".ib.idx")
8932
let outputBpFile = opts ^. the @"outputBpFile" & fromMaybe (filePath <> ".bp.idx")
@@ -93,12 +36,6 @@ runCreateIndexSimple opts = do
9336
LBS.writeFile outputIbFile (LBS.toLazyByteString ibs)
9437
LBS.writeFile outputBpFile (LBS.toLazyByteString bps)
9538

96-
runCreateIndex :: Z.CreateIndexOptions -> IO ()
97-
runCreateIndex opts = case opts ^. the @"backend" of
98-
"standard" -> runCreateIndexStandard opts
99-
"simple" -> runCreateIndexSimple opts
100-
unknown -> IO.hPutStrLn IO.stderr $ "Unknown backend " <> show unknown
101-
10239
optsCreateIndex :: Parser Z.CreateIndexOptions
10340
optsCreateIndex = Z.CreateIndexOptions
10441
<$> strOption
@@ -114,13 +51,6 @@ optsCreateIndex = Z.CreateIndexOptions
11451
<> help "Backend for creating index"
11552
<> metavar "STRING"
11653
)
117-
<*> strOption
118-
( long "method"
119-
<> short 'm'
120-
<> value "original"
121-
<> help "Method for creating index"
122-
<> metavar "STRING"
123-
)
12454
<*> optional
12555
( strOption
12656
( long "output-ib-file"

0 commit comments

Comments
 (0)