Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
65ad500
add the suffix array function to Arkouda
zhihuidu Nov 19, 2020
7af0b51
add suffix array benchmark sa.py
zhihuidu Nov 19, 2020
d288c10
add read file suffix array function and all libdivsufsort files
zhihuidu Nov 25, 2020
9a22704
change name
zhihuidu Nov 26, 2020
c81d755
add suffix_array Python test
zhihuidu Nov 26, 2020
dbd6d96
add Chapel skew suffix array algorithm
zhihuidu Dec 8, 2020
21df359
add the lcp array method
zhihuidu Dec 8, 2020
940cb79
confirm submit all changes
zhihuidu Dec 9, 2020
9f22f79
add thirdpary files
zhihuidu Dec 14, 2020
3a220dc
solve conflict
zhihuidu Dec 14, 2020
4131820
add thirdparty files
zhihuidu Dec 14, 2020
c9e03fb
updated
dbader13 Dec 14, 2020
d184048
Update SACA.chpl comments
alvaradoo Dec 14, 2020
8f88f4c
Merge pull request #3 from alvaradoo/patch-1
zhihuidu Dec 14, 2020
b6228e5
remove tab, remove unused codes
zhihuidu Dec 15, 2020
42e3ba7
change to relative directory
zhihuidu Dec 17, 2020
9702d46
include sa.py into run_benchmarks.py
zhihuidu Dec 20, 2020
7a0b197
suffix_arry_file updated
zhihuidu Dec 20, 2020
e90ca27
remove tab in MultiTypeSymEntry.chpl
zhihuidu Dec 20, 2020
2db17d8
datatype in string.py
zhihuidu Dec 20, 2020
da30cfa
remove suffixarray_test.py
zhihuidu Dec 20, 2020
6b6e41a
update third party config
zhihuidu Dec 20, 2020
38818a3
follow suggestions from community
zhihuidu Dec 21, 2020
32ecc0c
solve the conflict
zhihuidu Dec 22, 2020
a6c536e
update the SegSArray
zhihuidu Dec 22, 2020
16cca77
remove unused import
zhihuidu Dec 24, 2020
9d60563
align with strings function
zhihuidu Dec 24, 2020
59174ac
correct a typo
zhihuidu Dec 24, 2020
0cda91d
type match
zhihuidu Dec 24, 2020
e1c3173
data type
zhihuidu Dec 24, 2020
3861e62
bool or pdarray
zhihuidu Dec 24, 2020
edc3f63
remove binary op
zhihuidu Dec 24, 2020
6074b60
remove binary op
zhihuidu Dec 24, 2020
4f773e2
remove the bug causing wrong return string value
zhihuidu Dec 26, 2020
267238c
import SArrays class in pdarraysetops.py
zhihuidu Dec 26, 2020
189c32e
add an empty correctness function
zhihuidu Dec 26, 2020
0aa835c
copy master gather.py
zhihuidu Dec 26, 2020
ed98498
make sa.py check easy
zhihuidu Dec 26, 2020
9f5c3d3
check test/*.chpl
zhihuidu Dec 26, 2020
ac7e209
add corectness check in sa.py
zhihuidu Dec 27, 2020
6fd3b05
change suffix array return as an int array
zhihuidu Dec 27, 2020
50be2e3
copy string_test.py
zhihuidu Dec 28, 2020
f1781e8
Fixed bug in UnitTestPeelStick
reuster986 Jan 4, 2021
221679e
update lcp related code
zhihuidu Jan 4, 2021
a8a195a
Merge pull request #4 from reuster986/master
zhihuidu Jan 4, 2021
0bff3e4
remove the enhenced attribute in sym table
zhihuidu Jan 4, 2021
2af5ce8
check the comments to remove docs CI check error
zhihuidu Jan 5, 2021
2d40c0e
solve the sphinx error
zhihuidu Jan 5, 2021
c782b5a
add switch betwteen different SA algorithms
zhihuidu Jan 8, 2021
00b3579
single locales for C code
zhihuidu Jan 8, 2021
80f78c6
resolve convlict
zhihuidu Jan 8, 2021
010a446
return the string for suffix_array_file
zhihuidu Jan 11, 2021
f5ca67a
remove mypy CI check error
zhihuidu Jan 11, 2021
6d65335
tuple data type
zhihuidu Jan 11, 2021
fb89d88
solve conflict of suffix array, SegmentedMsg, SegmentedArray and run…
zhihuidu Mar 15, 2021
18f481c
handle some bytes to string
zhihuidu Mar 15, 2021
924ac94
Merge branch 'master' of github.com:mhmerrill/arkouda
zhihuidu Mar 16, 2021
87c6327
merge with the latest version
zhihuidu Mar 16, 2021
82776f9
solve the inconsistency in dtype
zhihuidu Mar 19, 2021
dda2264
Merge branch 'master' into string-suffix-array-functionality
stress-tess Jun 8, 2021
a543312
Minimum changes to pass current tests (1/2):
Jun 9, 2021
6f8cbfe
Minimum changes to pass current tests (2/2):
Jun 15, 2021
a9167e6
Merge branch 'master' into pr/627
Jun 16, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions arkouda/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from arkouda.client import *
from arkouda.dtypes import *
from arkouda.dtypes import npstr as akstr
from arkouda.pdarrayclass import *
from arkouda.sorting import *
from arkouda.pdarraysetops import *
Expand Down
40 changes: 40 additions & 0 deletions arkouda/pdarrayclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,46 @@ def unescape(s):
raise ValueError(("unsupported value from server {} {}".\
format(mydtype.name, value)))




@typechecked
def _parse_single_int_array_value(msg : str) -> object:
    """
    Convert an int64 array reply from the arkouda server into a Python list
    of ints. The user should not call this function directly.

    The message is split into a dtype name and a value. The value must
    contain a double-quoted payload; the first whitespace-delimited token
    inside the quotes is discarded and every remaining token is parsed as
    an integer.

    Parameters
    ----------
    msg : str
        server reply holding the dtype name followed by the quoted payload

    Returns
    -------
    object
        list of int parsed from the quoted payload

    Raises
    ------
    ValueError
        Raised if the dtype named in the message is not int64, or if the
        payload is not quoted, has no tokens after the first one, or
        contains a token that is not an integer
    """
    dtname, value = msg.split(maxsplit=1)
    mydtype = dtype(dtname)
    # Reject other dtypes up front so this specific message reaches the
    # caller instead of being rewritten by the parsing handler below.
    if mydtype != akint64:
        raise ValueError(("not correct int data type from server {} {}".\
                          format(mydtype.name, value)))
    try:
        # Text between the first pair of double quotes is the payload
        payload = value.split("\"")[1]
        # Drop the leading token; the rest are the integer entries
        _, sastr = payload.split(maxsplit=1)
        return [int(numeric_string) for numeric_string in sastr.split()]
    except (IndexError, ValueError) as err:
        raise ValueError(("unsupported value from server {} {}".\
                          format(mydtype.name, value))) from err


# class for the pdarray
class pdarray:
"""
Expand Down
141 changes: 137 additions & 4 deletions arkouda/pdarraycreation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,12 @@
int_scalars, numeric_scalars
from arkouda.dtypes import dtype as akdtype
from arkouda.pdarrayclass import pdarray, create_pdarray
from arkouda.strings import Strings
from arkouda.strings import Strings, SArrays

__all__ = ["array", "zeros", "ones", "zeros_like", "ones_like",
"arange", "linspace", "randint", "uniform", "standard_normal",
"random_strings_uniform", "random_strings_lognormal",
"from_series"
]
"from_series", "suffix_array","lcp_array","suffix_array_file"]

@typechecked
def from_series(series : pd.Series,
Expand Down Expand Up @@ -607,7 +606,7 @@ def randint(low : numeric_scalars, high : numeric_scalars,

repMsg = generic_msg(cmd='randint', args='{} {} {} {} {}'.\
format(sizestr, dtype.name, lowstr, highstr, seed))
return create_pdarray(repMsg)
return create_pdarray(cast(str,repMsg))

@typechecked
def uniform(size : int_scalars, low : numeric_scalars=float(0.0),
Expand Down Expand Up @@ -821,3 +820,137 @@ def random_strings_lognormal(logmean : numeric_scalars, logstd : numeric_scalars
NUMBER_FORMAT_STRINGS['float64'].format(logstd),
seed))
return Strings(*(cast(str,repMsg).split('+')))



@typechecked
def suffix_array(strings : Strings) -> SArrays:
    """
    Request the suffix arrays of the given strings from the server. The
    size/shape of each suffix array is the same as the corresponding string.

    A suffix array lists the starting indices of a string's suffixes in
    sorted order. For the string "banana$" the suffixes are
        s[0]="banana$"
        s[1]="anana$"
        s[2]="nana$"
        s[3]="ana$"
        s[4]="na$"
        s[5]="a$"
        s[6]="$"
    and sorting them gives
        s[6]="$"
        s[5]="a$"
        s[3]="ana$"
        s[1]="anana$"
        s[0]="banana$"
        s[4]="na$"
        s[2]="nana$"
    so sa=[6,5,3,1,0,4,2]

    Parameters
    ----------
    strings : Strings
        The strings whose suffix arrays are requested

    Returns
    -------
    SArrays
        The suffix arrays of the given strings

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error in executing group request or
        creating the pdarray encapsulating the return message
    """
    request = "{} {} {}".format(strings.objtype,
                                strings.offsets.name,
                                strings.bytes.name)
    reply = generic_msg(cmd="segmentedSuffixAry", args=request)
    # The reply carries the SArrays constructor arguments joined by '+'
    parts = cast(str, reply).split('+')
    return SArrays(*parts)
Comment on lines +867 to +872
Copy link
Member

@stress-tess stress-tess Jun 22, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While not the biggest deal, there are a handful of places where spacing/formatting differs from the rest of the arkouda codebase. This is a bit of a nitpick, but I think having consistency is important for readability/maintainability.

I would likely update this representative example to look something like this

cmd = "segmentedSuffixAry"
args = "{} {} {}".format(strings.objtype, strings.offsets.name, strings.bytes.name)
repMsg = generic_msg(cmd=cmd, args=args)
return SArrays(*(cast(str, repMsg).split("+")))



@typechecked
def lcp_array(suffixarrays : SArrays, strings : Strings) -> SArrays:
    """
    Request the longest common prefix (LCP) arrays of the given suffix
    arrays from the server. The size/shape of each lcp array is the same
    as the corresponding suffix array.

    Parameters
    ----------
    suffixarrays : SArrays
        The suffix arrays whose LCP arrays are requested
    strings : Strings
        The strings sent alongside the suffix arrays (presumably the
        strings the suffix arrays were built from -- confirm with caller)

    Returns
    -------
    SArrays
        The LCP arrays of the given suffix arrays

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error in executing group request or
        creating the pdarray encapsulating the return message
    """
    request = "{} {} {} {} {}".format(suffixarrays.objtype,
                                      suffixarrays.offsets.name,
                                      suffixarrays.bytes.name,
                                      strings.offsets.name,
                                      strings.bytes.name)
    reply = generic_msg(cmd="segmentedLCP", args=request)
    # The reply carries the SArrays constructor arguments joined by '+'
    parts = cast(str, reply).split('+')
    return SArrays(*parts)

@typechecked
def suffix_array_file(filename: str) -> tuple:
    """
    Ask the server to build the suffix array of a file's content, treating
    the content as a single string. This function is mainly used for
    testing correctness and performance.

    A suffix array lists the starting indices of a string's suffixes in
    sorted order. For the string "banana$" the suffixes are
        s[0]="banana$"
        s[1]="anana$"
        s[2]="nana$"
        s[3]="ana$"
        s[4]="na$"
        s[5]="a$"
        s[6]="$"
    and sorting them gives
        s[6]="$"
        s[5]="a$"
        s[3]="ana$"
        s[1]="anana$"
        s[0]="banana$"
        s[4]="na$"
        s[2]="nana$"
    so sa=[6,5,3,1,0,4,2]

    Parameters
    ----------
    filename : str
        Name of the file, sent to the server unchanged as the only argument

    Returns
    -------
    tuple
        (SArrays, Strings): the suffix array built from the first two
        '+'-separated fields of the server reply, and the Strings object
        built from the next two fields

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error in executing group request or
        creating the pdarray encapsulating the return message
    """
    cmd = "segmentedSAFile"
    args = "{}".format(filename)
    repMsg = generic_msg(cmd=cmd, args=args)
    fields = cast(str, repMsg).split('+')
    # Fields 0-1 are the SArrays constructor arguments and fields 2-3 the
    # Strings constructor arguments; the slices are lists, so no cast to str.
    suffixarray = SArrays(*fields[0:2])
    originalstr = Strings(*fields[2:4])
    return suffixarray, originalstr
79 changes: 78 additions & 1 deletion arkouda/pdarraysetops.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from arkouda.pdarrayclass import pdarray, create_pdarray
from arkouda.pdarraycreation import zeros, zeros_like, array
from arkouda.sorting import argsort
from arkouda.strings import Strings
from arkouda.strings import Strings,SArrays
from arkouda.logger import getArkoudaLogger

Categorical = ForwardRef('Categorical')
Expand Down Expand Up @@ -168,6 +168,83 @@ def in1d(pda1 : Union[pdarray,Strings,'Categorical'], pda2 : Union[pdarray,Strin
else:
raise TypeError('Both pda1 and pda2 must be pdarray, Strings, or Categorical')



def in1d_int(pda1 : Union[pdarray,SArrays,'Categorical'], pda2 : Union[pdarray,SArrays,'Categorical'], #type: ignore
             invert : bool=False) -> pdarray: #type: ignore
    """
    Test whether each element of a 1-D array is also present in a second
    array.

    Returns a boolean array the same length as `pda1` that is True
    where an element of `pda1` is in `pda2` and False otherwise.

    Parameters
    ----------
    pda1 : pdarray or SArrays or Categorical
        Input array.
    pda2 : pdarray or SArrays or Categorical
        The values against which to test each value of `pda1`. Must be the
        same type as `pda1`.
    invert : bool, optional
        If True, the values in the returned array are inverted (that is,
        False where an element of `pda1` is in `pda2` and True otherwise).
        Default is False. The flag is sent to the server for pdarray and
        SArrays inputs; it is not forwarded when `pda1` supplies its own
        `in1d` method.

    Returns
    -------
    pdarray, bool
        The values `pda1[in1d]` are in `pda2`.

    Raises
    ------
    TypeError
        Raised if pda1 has no `in1d` method and the two arguments are not
        both pdarray or both SArrays
    RuntimeError
        Raised if the dtype of either array is not supported

    See Also
    --------
    unique, intersect1d, union1d
    """
    from arkouda.categorical import Categorical as Categorical_

    # Objects that expose their own in1d method do the membership test
    # themselves; this check comes first, as in the original branch order.
    if hasattr(pda1, 'in1d'):
        return cast(Categorical_,pda1).in1d(pda2)

    if isinstance(pda1, pdarray) and isinstance(pda2, pdarray):
        request = "in1d {} {} {}".format(pda1.name, pda2.name, invert)
        reply = generic_msg(request)
        return create_pdarray(cast(str, reply))

    if isinstance(pda1, SArrays) and isinstance(pda2, SArrays):
        request = "segmentedIn1dInt {} {} {} {} {} {} {}".format(
            pda1.objtype,
            pda1.offsets.name,
            pda1.bytes.name,
            pda2.objtype,
            pda2.offsets.name,
            pda2.bytes.name,
            invert)
        reply = generic_msg(request)
        return create_pdarray(cast(str, reply))

    raise TypeError('Both pda1 and pda2 must be pdarray, SArrays, or Categorical')

@typechecked
def concatenate(arrays : Sequence[Union[pdarray,Strings,'Categorical']], #type: ignore
ordered : bool=True) -> Union[pdarray,Strings,'Categorical']: #type: ignore
Expand Down
Loading