-
Notifications
You must be signed in to change notification settings - Fork 0
aPandaStepAhead
import pandas
# import numpy # for (4), but this is inefficient anyway
tableColumns = ["audioCodec", "displayHeight", "testName"]
testFrame = pandas.DataFrame(columns=tableColumns)
print empty frame (only column names)
print(testFrame)
testFrame.loc[len(testFrame.index)] = [1,2,3]
i = 0
for i in range(20):
testFrame.at[len(testFrame)] = [i, i+1, i+2]
testFrame.at[len(testFrame)] = [i, i+1, "df"]
testFrame.at[len(testFrame)] = [chr(i+266), i+1, i+2]
newData = [4,5,6]
testFrame = testFrame.append(pandas.Series(newData), ignore_index=True)
This is very inefficient
newLine = pandas.DataFrame([1,2,3],None, columns=tableColumns)
newLine = pandas.DataFrame(numpy.array([1,2,3]), columns=tableColumns)
testFrame.append(newLine, ignore_index=True)
myDataFrame["diff_A_B"] = datfr["A"] - otherData["B"] # add columns `diff_A_B` from columns of two other `pandas.DataFrames`
testFrame = testFrame.T
dataFrame = dataFrame.drop("columnName", 1) # 0: rows, 1 columns
Note: ["cake", "alcohol"]
is a list.
df1 = df[["cake", "alcohol"]] # keep only "cake" and "alcohol" in df1
BUT: df[["a","b"]]
produces a copy :(
frames = [df1, df2, df3]
result = pd.concat(frames) # add list of dataframes you want to concatenate
or like this:
result = pd.concat([df1, df4], ignore_index=True) # `ignore_index`: discards initial counting >> if `True` count is number of rows -1
Here we want to merge on "configID" and "memType" (on
argument is not nesessary here since the default searches for matching columns):
# Make sure the indices are reset for both dataframes (reset with `df = df.reset_index()`)
groupedByMemTypeAcc = pandas.merge(groupedByMemTypeAcc, budgetsData, on=["configID", "memType"])
groupedByMemTypeAcc = groupedByMemTypeAcc.groupby(["configID", "memType"]).sum()
print(groupedByMemTypeAcc)
Input:
# groupedByMemTypeAcc
configID memType sizeDec
0 micro1 INT_FLASH 17196646
1 micro1 INT_RAM 15302
2 micro2 EXT_FLASH 44667659878
3 micro2 EXT_RAM 386547056
4 micro2 INT_RAM 3276
# budgetData
configID memType budget
0 micro1 INT_FLASH 20971520
1 micro1 INT_RAM 16384
2 micro1 EXT_FLASH 0
3 micro1 EXT_RAM 0
4 micro2 EXT_FLASH 68719476736
5 micro2 EXT_RAM 536870912
6 micro2 INT_RAM 65536
7 micro2 INT_FLASH 0
Output:
sizeDec budget
configID memType
micro1 INT_FLASH 17196646 20971520
INT_RAM 15302 16384
micro2 EXT_FLASH 44667659878 68719476736
EXT_RAM 386547056 536870912
INT_RAM 3276 65536
dataFrame = dataFrame.reset_index()
dataFrame.loc["partyHard"].replace(to_replace="nerds", value="hardcorePartyPeople")
dataFrame.iloc[1, 1].replace(to_replace="nerds", value="hardcorePartyPeople") # .iloc[columns, row]
Get only rows where values above 0 in "A" exists.
The operators are: |
for or, &
for and, and ~
for not. These must be grouped by using parentheses (e.g. s[(s < -1) | (s > 0.5)]
)
df = df[df["A"] > 0]
print(df)
# one two
# 0 1 2
# 1 3 4
# 2 5 6
# 3 7 8
df.set_index("two")
searchedValue = 6.6
# Get nearest index of column `two`
idx = testFrame.index[testFrame.index.get_loc(searchedValue, method='nearest')]
# Print nearest index of `searchedValue` at column two
print("Closest match:", df["two"][idx])
# Closest match: 8
grouped = dataFrame.groupby("Company")
Here column "Company"
becomes automatically the index (before it was the internal numbering >> see index_col=0
in section ".csv
files". Avoid this by appending .reset_index()
.
Groups by CompanyName
and Employees
and sums everything up.
groupedByMemType = groupedByMemType.groupby(["CompanyName", "Employees"]).sum()
Apply the median for every group item
dbAggregated = grouped.aggregate(numpy.median)
for key, item in grouped:
# do stuff with key & item
timeit df.shape
-> 1000000 loops, best of 3: 1.17 us per loop
timeit df[0].count()
-> 10000 loops, best of 3: 56 us per loop
timeit len(df.index)
-> 1000000 loops, best of 3: 381 ns per loop >> most intuitive and fastest :)
-> len(df.index) and df[0].count() are not interchangeable: count excludes NaNs, which is probably explains why it is slower
We want to achieve the following (A
, B
: input data; dA
: vertical delta):
A B dA
0 a b (a-c)
1 c d (c-e)
2 e f (e-g)
3 g h Nan
You can use .shift
to shift a whole column by one (or more) places.
df['dA'] = df['A'] - df['A'].shift(-1)
Gets column "video"
(containing paths), split by backslashes (note: you need .str
for pandas.DataFrame
string
modifications) and take last element (= file name):
self.dataFrame["video"] = self.dataFrame["video"].str.split("\\").str[-1]
...
testFrame.to_csv("blah.csv")
anotherFrame = pandas.read_csv("blah.csv", index_col=0) # say that column[0] should be use for internal indexing (otherwise a new one is added)
This format option only works for float
s but not for int
s
pandas.options.display.float_format = '{:14,.0f}'.format
groupedByMemType["sizeDec"] = groupedByMemType["sizeDec"].astype(float) # Change data type of one column (do this only if necessary)
print(groupedByMemType)
Print the data type of one pandas.DataFrame
-column:
print(groupedByMemType["sizeDec"].dtype)
If index is set: x=index; you can use .reset_index()
or .set_index("desiredColumn")
Essentially pandas
uses matplotlib
import pandas
import matplotlib
import matplotlib.pyplot as plt # needs to be imported explicitely
import numpy
# ...do stuff
dataFrame = ...
matplotlib.style.use('ggplot') # use nice style for plotting
dataFrame.plot() # plots everything in dataframe x=index
dataFrame.plot() # plots everything in dataframe x=index
dataFrame.plot.area() # do area plot
dataFrame.plot(kind=area) # do area plot
plt.show() # show plot
dataFrame.plot()
anotherFrame.plot()
# ...
plt.show() # or save
- Must be same type of plot
- Plots have to have the same lenth (in x direction)
plot = anotherFrame.plot() # Data to plot
matplotlib.pyplot.savefig("fileName.png", dpi=200) # dpi: optional
myFigure = plot.get_figure()
myFigure.savefig("myBeautifulFigure.png", # Rest is optional
dpi=300,
transparent=True # Make background colour transparent
)
-> https://stackoverflow.com/questions/7986567/matplotlib-how-to-set-the-current-figure
Example data to plot
import matplotlib
import matplotlib.pyplot as plt
# Plot bar graph
barGraph = groupedByMemTypeAcc[[headerUsed, headerAvailable]].plot.bar(stacked=True, figsize=(18, 7), rot=45, title=titleStr)
figWidth = 20
figHeight = 10
plt.rcParams['figure.figsize'] = [figWidth, figHeight]
plt.subplots_adjust(top=0.9, bottom=0.26, left=0.06, right=0.98) # Adjust space around our plot
plt.legend(loc='lower right', bbox_to_anchor=(0.98, -0.4)) # Placing legend below the plot (bottom right corner)
Available line styles: https://matplotlib.org/gallery/lines_bars_and_markers/line_styles_reference.html
df.plot(kind='line', style=['-', '--', '-.'])
# `kind` can be omitted
# `style`: one list item per column (do not count index)
# You can prefix the line patterns by specific colours like: `df.plot(kind='line', style=['b-', 'y--', 'g-.'])`
# See also: https://matplotlib.org/users/colors.html
# Ajusting data points in plots is similar: `df.plot(kind='line', style=['bo-', 'ys--', 'g^-.'])`
# circles-----^ ^-squares ^--triangles
# See also: https://stackoverflow.com/a/42820790/4773274
Since we have in this case a stacked bar graph we need to use an if
-statement in order to treat the two bar types seperately
for i, bar in enumerate(barGraph.patches):
if i >= len(barGraph.patches) / 2:
# Show budgets in kiB
barGraph.annotate(
"{:.0f} kiB".format(groupedByMemTypeAcc["budget"][i % (len(barGraph.patches) / 2)] / 1024),
xy=(bar.get_x(), (bar.get_height() + barGraph.patches[int(i % (len(barGraph.patches) / 2))].get_height())),
color="#030303")
else:
# Show percentage and value in kiB
barGraph.annotate(
"{:.1f} %\n{:.0f} kiB".format(bar.get_height(), groupedByMemTypeAcc["sizeDec"][i] / 1024),
xy=(bar.get_x(), bar.get_height() + 0.01),
color="#f2ece9") # see: https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.annotate.html#matplotlib.axes.Axes.annotate
Often for bar.get_height() + 0.01
a multiplication is used by some guys but that did not work for me since it accumulates the values somehow (I don't know what this happens).
dbAgg.plot.scatter(x="sec", y="averageTwoPercentQPnonI", c="bitstream_stats", s=100)
- s: dotsize
- c: if column name: adds another scale with colours the points according to its values
TODO: how to change colour of c
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License *.
Code (snippets) are licensed under a MIT License *.
* Unless stated otherwise