Skip to content

Commit

Permalink
Improves common subexpression elimination
Browse files Browse the repository at this point in the history
* Memory overhead is now constant and no longer linear
* Added upper limit for distance between instructions to combine
* Added command line option for distance limit
* Enabled CSE as optimization for -O3
  • Loading branch information
doe300 committed Dec 15, 2018
1 parent de37294 commit eedc6bd
Show file tree
Hide file tree
Showing 10 changed files with 111 additions and 55 deletions.
7 changes: 7 additions & 0 deletions include/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,13 @@ namespace vc4c
* if there are two optimizations which reverse each others changes.
*/
unsigned maxOptimizationIterations = 512;

/*
* Maximum distance between two instructions to be combined for the common subexpression optimization.
*
* NOTE: Setting this to a large value might lead to very long compilation times.
*/
unsigned maxCommonExpressionDinstance = 64;
};

/*
Expand Down
19 changes: 15 additions & 4 deletions src/Expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ Expression Expression::combineWith(const FastMap<const Local*, Expression>& inpu

if(code.numOperands == 1 && expr0 != nullptr)
{
if(code.isIdempotent() && expr0->code == code)
    // f(f(a)) = f(a)
    // Both operands are taken from the inner expression (expr0): the enclosing condition only guarantees
    // that expr0 is non-null, so dereferencing expr1 here may be a null-pointer access.
    return Expression{code, expr0->arg0, expr0->arg1, UNPACK_NOP, PACK_NOP, add_flag(deco, expr0->deco)};
if(code == OP_FTOI && expr0->code == OP_ITOF)
// ftoi(itof(i)) = i
return Expression{OP_V8MIN, expr0->arg0, NO_VALUE, UNPACK_NOP, PACK_NOP, add_flag(deco, expr0->deco)};
Expand Down Expand Up @@ -104,12 +107,20 @@ Expression Expression::combineWith(const FastMap<const Local*, Expression>& inpu

if(code.numOperands == 2)
{
if(code == OP_FADD && ((expr0 && expr0->code == OP_FADD) || (expr1 && expr1->code == OP_FADD)))
{
// TODO
}
if(code.isIdempotent() && arg0 == arg1)
// f(a, a) = a
return Expression{OP_V8MIN, arg0, arg0, UNPACK_NOP, PACK_NOP, deco};
if(OpCode::getLeftIdentity(code) == arg0)
return Expression{OP_V8MIN, arg1.value(), arg1, UNPACK_NOP, PACK_NOP, deco};
if(OpCode::getRightIdentity(code) == arg1)
return Expression{OP_V8MIN, arg0, arg0, UNPACK_NOP, PACK_NOP, deco};
if(OpCode::getLeftAbsorbingElement(code) == arg0)
return Expression{OP_V8MIN, arg0, arg0, UNPACK_NOP, PACK_NOP, deco};
if(OpCode::getRightAbsorbingElement(code) == arg1)
return Expression{OP_V8MIN, arg1.value(), arg1, UNPACK_NOP, PACK_NOP, deco};

// TODO also combine things like (a * 4) + a = a * 5 or (a * 4) - a = a * 3
// TODO can use associative, commutative properties?
}

return *this;
Expand Down
12 changes: 6 additions & 6 deletions src/Expression.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ namespace vc4c
*/
struct Expression
{
const OpCode code;
const Value arg0;
const Optional<Value> arg1;
const Unpack unpackMode = UNPACK_NOP;
const Pack packMode = PACK_NOP;
const intermediate::InstructionDecorations deco;
OpCode code;
Value arg0;
Optional<Value> arg1;
Unpack unpackMode = UNPACK_NOP;
Pack packMode = PACK_NOP;
intermediate::InstructionDecorations deco;

static Optional<Expression> createExpression(const intermediate::IntermediateInstruction& instr);

Expand Down
35 changes: 29 additions & 6 deletions src/analysis/AvailableExpressionAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,33 @@ using namespace vc4c;
using namespace vc4c::analysis;

AvailableExpressionAnalysis::AvailableExpressionAnalysis() :
LocalAnalysis(AvailableExpressionAnalysis::analyzeAvailableExpressions, AvailableExpressionAnalysis::to_string)
LocalAnalysis(
AvailableExpressionAnalysis::analyzeAvailableExpressionsWrapper, AvailableExpressionAnalysis::to_string)
{
}

AvailableExpressions AvailableExpressionAnalysis::analyzeAvailableExpressions(
std::pair<AvailableExpressions, Optional<Expression>> AvailableExpressionAnalysis::analyzeAvailableExpressions(
const intermediate::IntermediateInstruction* instr, const AvailableExpressions& previousExpressions,
FastMap<const Local*, FastSet<const Expression*>>& cache)
FastMap<const Local*, FastSet<const Expression*>>& cache, unsigned maxExpressionDistance)
{
PROFILE_START(AvailableExpressionAnalysis);
AvailableExpressions newExpressions(previousExpressions);
auto it = newExpressions.begin();
while(it != newExpressions.end())
{
if(it->second.second >= maxExpressionDistance)
{
// remove all "older" expressions, since we do not care for them anymore
it = newExpressions.erase(it);
}
else
{
++it->second.second;
++it;
}
}

Optional<Expression> expr;
if(instr->hasValueType(ValueType::LOCAL))
{
// re-set all expressions using the local written to as input
Expand All @@ -34,11 +50,11 @@ AvailableExpressions AvailableExpressionAnalysis::analyzeAvailableExpressions(
for(const auto& expr : cacheIt->second)
newExpressions.erase(*expr);
}
auto expr = Expression::createExpression(*instr);
expr = Expression::createExpression(*instr);
if(expr)
{
// only adds if expression is not already in there
auto it = newExpressions.emplace(expr.value(), instr);
auto it = newExpressions.emplace(expr.value(), std::make_pair(instr, 0));
if(it.second)
{
// add map from input locals to expression (if we really inserted an expression)
Expand All @@ -51,7 +67,14 @@ AvailableExpressions AvailableExpressionAnalysis::analyzeAvailableExpressions(
}
}
PROFILE_END(AvailableExpressionAnalysis);
return newExpressions;
return std::make_pair(std::move(newExpressions), std::move(expr));
}

/*
 * Adapter with the three-argument signature that is handed to LocalAnalysis.
 *
 * Forwards to analyzeAvailableExpressions using the largest representable distance limit and returns only the
 * available expressions, dropping the expression generated by the instruction itself.
 */
AvailableExpressions AvailableExpressionAnalysis::analyzeAvailableExpressionsWrapper(
    const intermediate::IntermediateInstruction* instr, const AvailableExpressions& previousExpressions,
    FastMap<const Local*, FastSet<const Expression*>>& cache)
{
    const auto unlimitedDistance = std::numeric_limits<unsigned>::max();
    return analyzeAvailableExpressions(instr, previousExpressions, cache, unlimitedDistance).first;
}

std::string AvailableExpressionAnalysis::to_string(const AvailableExpressions& expressions)
Expand Down
29 changes: 23 additions & 6 deletions src/analysis/AvailableExpressionAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@ namespace vc4c
{
/*
* Maps the available locals and the available expression writing into the given local for a given point in
* the program
* the program code. The additional integer value is the distance in instructions from the current position
* where the expression was written.
*/
using AvailableExpressions = FastMap<Expression, const intermediate::IntermediateInstruction*>;
using AvailableExpressions =
FastMap<Expression, std::pair<const intermediate::IntermediateInstruction*, unsigned>>;

/*
* Analyses the available expressions within a single basic block.
Expand All @@ -35,17 +37,32 @@ namespace vc4c
public:
explicit AvailableExpressionAnalysis();

private:
/*
* For an instruction reading a, b and writing c:
*
* - the available expression for c is re-set to the current instruction
*
* NOTE: Prefer calling this function directly for one instruction at a time and dropping the old results,
* rather than running the analysis over the whole block!
*
* Returns the available expressions for the instruction and the expression generated by the instruction, if
* any.
*/
static AvailableExpressions analyzeAvailableExpressions(const intermediate::IntermediateInstruction* instr,
const AvailableExpressions& previousExpressions,
FastMap<const Local*, FastSet<const Expression*>>& cache);
static std::pair<AvailableExpressions, Optional<Expression>> analyzeAvailableExpressions(
const intermediate::IntermediateInstruction* instr, const AvailableExpressions& previousExpressions,
FastMap<const Local*, FastSet<const Expression*>>& cache, unsigned maxExpressionDistance);

static std::string to_string(const AvailableExpressions& expressions);

private:
/*
* For an instruction reading a, b and writing c:
*
* - the available expression for c is re-set to the current instruction
*/
static AvailableExpressions analyzeAvailableExpressionsWrapper(
const intermediate::IntermediateInstruction* instr, const AvailableExpressions& previousExpressions,
FastMap<const Local*, FastSet<const Expression*>>& cache);
};
} /* namespace analysis */
} /* namespace vc4c */
Expand Down
2 changes: 2 additions & 0 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ static void printHelp()
<< "\tThe maximum depth of nested loops to move constants out of" << std::endl;
std::cout << "\t--foptimization-iterations=" << defaultConfig.additionalOptions.maxOptimizationIterations
<< "\tThe maximum number of iterations to repeat the optimizations in" << std::endl;
std::cout << "\t--fcommon-subexpression-threshold=" << defaultConfig.additionalOptions.maxCommonExpressionDinstance
<< "\tThe maximum distance for two common subexpressions to be combined" << std::endl;

std::cout << "options:" << std::endl;
std::cout << "\t--kernel-info\t\tWrite the kernel-info meta-data (as required by VC4CL run-time, default)"
Expand Down
45 changes: 20 additions & 25 deletions src/optimization/Eliminator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,7 @@ bool optimizations::eliminateDeadCode(const Module& module, Method& method, cons
}
if(move != nullptr)
{
if(move->getSource().hasLocal() && move->getOutput()->hasLocal() && !move->hasConditionalExecution() &&
!move->hasPackMode() && !move->hasSideEffects() &&
dynamic_cast<intermediate::VectorRotation*>(move) == nullptr)
if(move->getSource().hasLocal() && move->getOutput()->hasLocal() && move->isSimpleMove())
{
// if for a move, neither the input-local nor the output-local are written to afterwards,
// XXX or the input -local is only written after the last use of the output-local
Expand Down Expand Up @@ -126,7 +124,7 @@ InstructionWalker optimizations::simplifyOperation(
intermediate::MoveOperation* move = it.get<intermediate::MoveOperation>();
if(op != nullptr)
{
if(!op->hasSideEffects() && !op->hasPackMode() && !op->hasUnpackMode())
if(op->isSimpleOperation())
{
// improve by pre-calculating first and second arguments
const Value firstArg =
Expand Down Expand Up @@ -230,8 +228,7 @@ InstructionWalker optimizations::simplifyOperation(
}
else if(move != nullptr)
{
if(move->getSource() == move->getOutput().value() && !move->hasSideEffects() && !move->hasPackMode() &&
!move->hasUnpackMode() && !it.has<intermediate::VectorRotation>())
if(move->getSource() == move->getOutput().value() && move->isSimpleMove())
{
// skip copying to same, if no flags/signals/pack and unpack-modes are set
logging::debug() << "Removing obsolete " << move->to_string() << logging::endl;
Expand Down Expand Up @@ -512,8 +509,7 @@ bool optimizations::eliminateRedundantMoves(const Module& module, Method& method
it.getBasicBlock()->walkEnd()) :
Optional<InstructionWalker>{};

if(!move->hasPackMode() && !move->hasUnpackMode() && move->getSource() == move->getOutput().value() &&
!move->doesSetFlag() && !move->hasSideEffects())
if(move->getSource() == move->getOutput().value() && move->isSimpleMove())
{
if(move->signal == SIGNAL_NONE)
{
Expand Down Expand Up @@ -616,10 +612,10 @@ bool optimizations::eliminateRedundantBitOp(const Module& module, Method& method
auto it = method.walkAllInstructions();
while(!it.isEndOfMethod())
{
if(it.get() && !it->hasSideEffects() && !it->hasPackMode() && !it->hasUnpackMode())
auto op = it.get<intermediate::Operation>();
if(op && op->isSimpleOperation())
{
auto op = it.get<intermediate::Operation>();
if(op && op->op == OP_AND && !op->hasUnpackMode() && !op->hasPackMode())
if(op->op == OP_AND)
{
// and v1, v2, v3 => and v1, v2, v4
// and v4, v1, v2 mov v4, v1
Expand Down Expand Up @@ -658,7 +654,7 @@ bool optimizations::eliminateRedundantBitOp(const Module& module, Method& method
foundAnd(out, arg1.local(), it);
};

if(op && op->op == OP_OR && !op->hasUnpackMode() && !op->hasPackMode())
if(op->op == OP_OR)
{
// or v1, v2, v3 => or v1, v2, v4
// and v4, v1, v2 mov v4, v2
Expand Down Expand Up @@ -714,28 +710,27 @@ bool optimizations::eliminateCommonSubexpressions(const Module& module, Method&
bool replacedSomething = false;
for(auto& block : method)
{
// FIXME leaks/uses huge amount of memory for clpeak integer test (20+ GB!!)
// TODO needs speed/space optimization
// esp. in combination with PropagateMoves and EliminateDeadCode this is very hard-core (e.g. for clpeak integer
// test)
analysis::AvailableExpressionAnalysis analysis;
analysis(block);
// we do not run the whole analysis in front, but only the next step to save on memory usage
// For that purpose, we also override the previous expressions on every step
analysis::AvailableExpressionAnalysis::Cache cache;
analysis::AvailableExpressions expressions;

for(auto it = block.walk(); !it.isEndOfBlock(); it.nextInBlock())
{
if(!it.has())
continue;
auto expr = Expression::createExpression(*it.get());
Optional<Expression> expr;
std::tie(expressions, expr) = analysis::AvailableExpressionAnalysis::analyzeAvailableExpressions(
it.get(), expressions, cache, config.additionalOptions.maxCommonExpressionDinstance);
if(expr)
{
// TODO add some kind of maximum number of instructions (accumulator threshold) to combine over??
auto exprIt = analysis.getResult(it.get()).find(expr.value());
if(exprIt != analysis.getResult(it.get()).end() && exprIt->second != it.get())
auto exprIt = expressions.find(expr.value());
if(exprIt != expressions.end() && exprIt->second.first != it.get())
{
logging::debug() << "Found common subexpression: " << it->to_string() << " is the same as "
<< exprIt->second->to_string() << logging::endl;
it.reset(
new intermediate::MoveOperation(it->getOutput().value(), exprIt->second->getOutput().value()));
<< exprIt->second.first->to_string() << logging::endl;
it.reset(new intermediate::MoveOperation(
it->getOutput().value(), exprIt->second.first->getOutput().value()));
replacedSomething = true;
}
}
Expand Down
4 changes: 1 addition & 3 deletions src/optimization/Optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,6 @@ const std::vector<OptimizationPass> Optimizer::ALL_PASSES = {
OptimizationPass("CombineRotations", "combine-rotations", combineVectorRotations,
"combines duplicate vector rotations, e.g. introduced by vector-shuffle into a single rotation",
OptimizationType::REPEAT),
// XXX not enabled with any optimization level for now
OptimizationPass("CommonSubexpressionElimination", "eliminate-common-subexpressions", eliminateCommonSubexpressions,
"eliminates repetitive calculations of common expressions by re-using previous results (WIP, slow)",
OptimizationType::REPEAT),
Expand Down Expand Up @@ -292,6 +291,7 @@ std::set<std::string> Optimizer::getPasses(OptimizationLevel level)
passes.emplace("extract-loads-from-loops");
passes.emplace("schedule-instructions");
passes.emplace("work-group-cache");
passes.emplace("eliminate-common-subexpressions");
// fall-through on purpose
case OptimizationLevel::MEDIUM:
passes.emplace("merge-blocks");
Expand All @@ -300,8 +300,6 @@ std::set<std::string> Optimizer::getPasses(OptimizationLevel level)
passes.emplace("eliminate-bit-operations");
passes.emplace("copy-propagation");
passes.emplace("combine-loads");
// TODO CSE is disabled, since it can result in long compilation times and very large memory consumption
// passes.emplace("eliminate-common-subexpressions");
// fall-through on purpose
case OptimizationLevel::BASIC:
passes.emplace("reorder-blocks");
Expand Down
12 changes: 7 additions & 5 deletions src/tools/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,17 +174,19 @@ bool tools::parseConfigurationParameter(Configuration& config, const std::string
return false;
}
if(paramName == "combine-load-threshold")
config.additionalOptions.combineLoadThreshold = intValue;
config.additionalOptions.combineLoadThreshold = static_cast<unsigned>(intValue);
else if(paramName == "accumulator-threshold")
config.additionalOptions.accumulatorThreshold = intValue;
config.additionalOptions.accumulatorThreshold = static_cast<unsigned>(intValue);
else if(paramName == "replace-nop-threshold")
config.additionalOptions.replaceNopThreshold = intValue;
config.additionalOptions.replaceNopThreshold = static_cast<unsigned>(intValue);
else if(paramName == "register-resolver-rounds")
config.additionalOptions.registerResolverMaxRounds = intValue;
config.additionalOptions.registerResolverMaxRounds = static_cast<unsigned>(intValue);
else if(paramName == "move-constants-depth")
config.additionalOptions.moveConstantsDepth = intValue;
else if(paramName == "optimization-iterations")
config.additionalOptions.maxOptimizationIterations = intValue;
config.additionalOptions.maxOptimizationIterations = static_cast<unsigned>(intValue);
else if(paramName == "common-subexpression-threshold")
config.additionalOptions.maxCommonExpressionDinstance = static_cast<unsigned>(intValue);
else
{
std::cerr << "Cannot set unknown optimization parameter: " << paramName << " to " << value << std::endl;
Expand Down
1 change: 1 addition & 0 deletions test/TestOptimizations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ TestOptimizations::TestOptimizations() : TestEmulator(true)
TEST_ADD_WITH_STRING(TestOptimizations::testClamp, pass.parameterName);
TEST_ADD_WITH_STRING(TestOptimizations::testCross, pass.parameterName);
}
//TODO the profiling info is wrong, since all optimization counters get merged!
TEST_ADD(TestEmulator::printProfilingInfo);
}

Expand Down

0 comments on commit eedc6bd

Please sign in to comment.