11package twg2 .jbcm ;
22
3+ import java .util .ArrayList ;
4+ import java .util .Collections ;
5+ import java .util .List ;
6+ import java .util .concurrent .atomic .AtomicInteger ;
7+
38import twg2 .collections .primitiveCollections .IntArrayList ;
49import twg2 .collections .primitiveCollections .IntListReadOnly ;
510import twg2 .jbcm .Opcodes .Type ;
11+ import twg2 .jbcm .ir .JumpConditionInfo ;
12+ import twg2 .jbcm .ir .JumpConditionInfo .UsageHint ;
613
714/** Trace all possible paths through the code in a method. A code flow follows jump, branch/condition, return, and throw instructions.
815 * Circular paths end at the first jump/branch destination which already exists in the code flow.
916 * @author TeamworkGuy2
1017 * @since 2020-12-03
1118 */
1219public class CodeFlow {
20+ /** The size in bytes of a GOTO instruction: 1 byte opcode + 2 byte operand */
21+ public static final int GOTO_SIZE = 3 ;
22+
1323
1424 /** Starting at a given point in a bytecode array, follow code jumps and branches to all termination (return/throw) points potentially reachable from the starting point
1525 * @param idx the starting point
@@ -20,22 +30,32 @@ public class CodeFlow {
2030 * and can easily be converted back by negating them again. This differentiates non-terminal indexes from all
2131 * valid terminal indexes because valid code indexes cannot be less than 0.
2232 */
23- public static IntArrayList getFlowPaths (int idx , byte [] instr , IntArrayList dstPath ) {
24- for (int i = idx , size = instr .length ; i < size ; i ++) {
25- Opcodes opc = Opcodes .get (instr [i ] & 0xFF );
33+ public static IntArrayList getFlowPaths (byte [] code , int idx ) {
34+ var dstPath = new IntArrayList ();
35+ getFlowPaths (code , idx , code .length , dstPath , 0 );
36+ return dstPath ;
37+ }
38+
39+
/** Recursive worker that walks {@code code} from {@code idx} up to (excluding) {@code max} and appends the flow to {@code dstPath}.
 * The index {@code i} of each jump/condition instruction is appended as {@code ~i} and its jump destination is
 * traced by a recursive call; the index of each return/athrow instruction is appended unchanged and ends the current path.
 * A jump that is already present in {@code dstPath} is not followed again when {@code pathJumps} is 0.
 * @param code the bytecode array
 * @param idx the index to start tracing at
 * @param max the exclusive upper bound of the indexes to scan
 * @param dstPath the list that flow indexes are appended to
 * @param pathJumps jump counter for the current path: incremented for each jump recorded, replaced by the value
 * returned from the recursive call, and reset to 0 when a terminal instruction is reached
 * @return the updated {@code pathJumps} value
 */
40+ public static int getFlowPaths (byte [] code , int idx , int max , IntArrayList dstPath , int pathJumps ) {
41+ for (int i = idx ; i < max ; i ++) {
42+ Opcodes opc = Opcodes .get (code [i ]);
2643 int numOperands = opc .getOperandCount ();
2744
2845 // Type.JUMP instruction set includes all Type.CONDITION instructions
2946 if (opc .hasBehavior (Type .JUMP )) {
30- // follow the jump path if it has not already been followed (to avoid loops)
31- if (! dstPath .contains (~i )) {
32- dstPath . add (~ i ) ;
47+ // skip the jump path if it has already been followed and this is the beginning (to avoid loops)
48+ if (dstPath .contains (~i ) && pathJumps == 0 ) {
49+ break ;
3350 }
34- int jumpDst = opc .getJumpDestination (instr , i );
51+ dstPath .add (~i );
52+ pathJumps ++;
53+ int jumpDst = opc .getJumpDestination (code , i );
// NOTE(review): the retry below repeats the call with identical arguments; looks like a leftover debugging hook - confirm before removing
3554 if (jumpDst < 0 ) {
36- jumpDst = opc .getJumpDestination (instr , i );
55+ jumpDst = opc .getJumpDestination (code , i );
3756 }
38- getFlowPaths (jumpDst , instr , dstPath );
57+ int subPathJumps = getFlowPaths (code , jumpDst , max , dstPath , pathJumps );
58+ pathJumps = subPathJumps ;
3959
4060 // end this code path if the jump path is unconditional (i.e. GOTO or JSR)
4161 if (!opc .hasBehavior (Type .CONDITION )) {
@@ -45,13 +65,224 @@ public static IntArrayList getFlowPaths(int idx, byte[] instr, IntArrayList dstP
4565 // end this code flow path once a terminal instruction is reached
4666 else if (opc .hasBehavior (Type .RETURN ) || opc == Opcodes .ATHROW ) {
4767 dstPath .add (i );
68+ pathJumps = 0 ;
4869 break ;
4970 }
5071
// skip over this instruction's operand bytes (a negative operand count is treated as 0)
5172 i += (numOperands < 0 ? 0 : numOperands );
5273 }
5374
54- return dstPath ;
75+ return pathJumps ;
76+ }
77+
78+
79+ /**
80+ * @param code the code array
81+ * @param offset the offset into the code array at which to start finding instructions
82+ * @param length the number of bytes of the code array to check through
83+ * @return
84+ */
85+ public static List <JumpConditionInfo > findFlowConditions (byte [] code , int offset , int length ) {
86+ var conditions = new ArrayList <JumpConditionInfo >(); // track GOTO/IF_* loops detected in the code
87+
88+ // BYTECODE LOOP:
89+ for (int i = offset , size = offset + length ; i < size ; i ++) {
90+ Opcodes opc = Opcodes .get (code [i ]);
91+ int numOperands = opc .getOperandCount ();
92+ // Special handling for instructions with unpredictable byte code lengths
93+ if (numOperands == Opcodes .Const .UNPREDICTABLE ) {
94+ if (Opcodes .WIDE .is (code [i ])) {
95+ i ++; // WIDE opcodes are nested around other operations
96+ opc = Opcodes .get (code [i ]);
97+ numOperands = opc .getOperandCount () * 2 ; // WIDE opcodes double the operands of the widened opcode
98+ }
99+ else if (Opcodes .TABLESWITCH .is (code [i ])) {
100+ throw new IllegalStateException ("tableswitch code handling not implemented" );
101+ }
102+ else if (Opcodes .LOOKUPSWITCH .is (code [i ])) {
103+ throw new IllegalStateException ("lookupswitch code handling not implemented" );
104+ }
105+ }
106+ int jumpRelative = CodeUtility .loadOperands (numOperands , code , i );
107+
108+ // form 1: [..., GOTO <setup_if[0]>, instructions[], setup_if[], IF_* <instructions[0]>, ...] - for()/while() forward GOTO, condition after loop with backward jump
109+ // form 2: [..., setup_if[], IF_* <after[0]>, instructions[], GOTO <setup_if[0]>, after[], ...] - for()/while() condition before loop with forward jump, backward GOTO
110+ // form 3: [..., instructions[], setup_if[], IF_* <instructions[0]>, after[], ...] - do{}while() condition after loop with backward jump
111+ var isJump = opc .hasBehavior (Opcodes .Type .JUMP );
112+ // backward jump, required for a loop (thought experiment: create a loop, using Java bytecodes, that does not jump backward)
113+ // although a code obfuscator could re-arrange code and include backward jumps so not all backward jumps are loops
114+ if (isJump && jumpRelative < 0 ) {
115+ conditions .add (JumpConditionInfo .loadConditionFlow (opc , i , jumpRelative , code , UsageHint .FOR_OR_WHILE_LOOP ));
116+ // 'for' or 'while' loop has to evaluate the condition first so it needs an IF or GOTO at the beginning
117+ // 'do-while' loop evaluates condition after loop runs once, only compiled form seen so far is: no GOTO and one backward jump at the end
118+ }
119+ else if (opc .hasBehavior (Opcodes .Type .CONDITION )) {
120+ conditions .add (JumpConditionInfo .loadConditionFlow (opc , i , jumpRelative , code , UsageHint .IF ));
121+ }
122+ i += (numOperands < 0 ) ? 0 : numOperands ;
123+ }
124+
125+ Collections .sort (conditions , JumpConditionInfo .LOWER_INDEX_SORTER );
126+
127+ // post processing - convert special cases
128+ for (int i = 0 , size = conditions .size (); i < size ; i ++) {
129+ var loop = conditions .get (i );
130+ // find and convert if-conditions that may have been miss-identified as loops
131+ // case: an if-statement inside a loop where there are no instructions after the if-statement and before the
132+ // end of the loop may be compiled as a condition with a backward jump and thus look like a loop, we can tell
133+ // in the case when it shares the same jump destination as the closest parent loop that contains it
134+ // form: [..., loop_start, instructions[], setup_if[], IF_* <loop_start>, instructions_in_if[], loop_end, ...]
135+ if (loop .targetOffset < 0 ) {
136+ var targetIndex = loop .getTargetIndex ();
137+ var loopUpperIndex = loop .getUpperIndex ();
138+ // look at conditions beyond the current one since they are later in the code or contained within the
139+ // current loop and a nested if-statement is contained within the nearest parent loop
140+ for (int j = i + 1 ; j < size ; j ++) {
141+ var loopJ = conditions .get (j );
142+ if (loopJ .opcIdx > loopUpperIndex ) {
143+ break ; // skip remaining conditions once we're past beyond the bounds of the current one
144+ }
145+ if (loopJ .targetOffset < 0 && targetIndex == loopJ .getTargetIndex () && containsIndex (loop , loopJ .opcIdx )) {
146+ // TODO debugging
147+ System .out .println ("converted loop to nested IF-within-loop at " + loopJ .opcIdx + " (" + loopJ .opc + ") contained in " + loop + " to " + targetIndex );
148+
149+ conditions .set (j , loopJ .withLoopEndIndexForIf (loopUpperIndex ));
150+ }
151+ }
152+ }
153+
154+ // set the potential-if-index of loops
155+ if (UsageHint .isLoop (loop .usageHint ) && loop .potentialIfIndex < 0 ) {
156+ var loopConditionIdx = findFirstIfConditionPointingToEndOf (conditions , i );
157+
158+ if (loopConditionIdx >= 0 ) {
159+ loop = loop .withPotentialIfIndex (conditions .get (loopConditionIdx ).opcIdx );
160+ conditions .set (i , loop );
161+
162+ // TODO debugging
163+ System .out .println ("converted if index for loop: " + loop + " found IF " + (loopConditionIdx >= 0 ? conditions .get (loopConditionIdx ) : "-1" ));
164+
165+ conditions .remove (loopConditionIdx );
166+ size --;
167+ if (loopConditionIdx <= i ) {
168+ i --;
169+ }
170+ }
171+ }
172+ }
173+
174+ return conditions ;
175+ }
176+
177+
178+ /** Find the first IF* condition that is contained within the condition located at {@code startIdx} in the {@code conditions} list.
179+ * @param conditions list of conditions, should include all IF* and GOTO instructions in the code,
180+ * sorted based on {@link JumpConditionInfo#getLowerIndex()}
181+ * @param startIdx the index into the {@code conditions} list of the condition to find an IF* condition within
182+ * @return the {@code conditions} index of the first matching IF* condition, else -1 if none is found
183+ */
184+ public static int findFirstIfConditionPointingToEndOf (List <JumpConditionInfo > conditions , int startIdx ) {
185+ var withinThis = conditions .get (startIdx );
186+ int maxIdx = withinThis .getUpperIndex ();
187+ int lowestOpcIdxFound = Integer .MAX_VALUE ;
188+ int lowestOpcIdxI = -1 ;
189+
190+ for (int i = startIdx + 1 , size = conditions .size (); i < size ; i ++) {
191+ var cond = conditions .get (i );
192+ // stop once the condition isn't contained within the target condition, we can safely break because the loops are sorted by lower bound index
193+ if (cond .getLowerIndex () > maxIdx ) {
194+ break ;
195+ }
196+ if (cond != withinThis && cond .opcIdx < lowestOpcIdxFound && containsIfAndEndsWith (withinThis , cond )) {
197+ lowestOpcIdxFound = cond .opcIdx ;
198+ lowestOpcIdxI = i ;
199+ }
200+ }
201+ return lowestOpcIdxI ;
202+ }
203+
204+
205+ public static boolean containsJumpTo (byte [] code , int offset , int length , int targetIndex ) {
206+ // BYTECODE LOOP:
207+ for (int i = offset , size = offset + length ; i < size ; i ++) {
208+ Opcodes opc = Opcodes .get (code [i ]);
209+ int numOperands = opc .getOperandCount ();
210+ // Special handling for instructions with unpredictable byte code lengths
211+ if (numOperands == Opcodes .Const .UNPREDICTABLE ) {
212+ if (Opcodes .WIDE .is (code [i ])) {
213+ i ++; // WIDE opcodes are nested around other operations
214+ opc = Opcodes .get (code [i ]);
215+ numOperands = opc .getOperandCount () * 2 ; // WIDE opcodes double the operands of the widened opcode
216+ }
217+ else if (Opcodes .TABLESWITCH .is (code [i ])) {
218+ throw new IllegalStateException ("tableswitch code handling not implemented" );
219+ }
220+ else if (Opcodes .LOOKUPSWITCH .is (code [i ])) {
221+ throw new IllegalStateException ("lookupswitch code handling not implemented" );
222+ }
223+ }
224+ if (opc .hasBehavior (Opcodes .Type .JUMP )) {
225+ int jumpRelative = CodeUtility .loadOperands (numOperands , code , i );
226+ if (i + jumpRelative == targetIndex ) {
227+ return true ;
228+ }
229+ }
230+
231+ i += (numOperands < 0 ) ? 0 : numOperands ;
232+ }
233+ return false ;
234+ }
235+
236+
237+ public static int findLastOpcodeIndex (byte [] instr , int start , int end ) {
238+ AtomicInteger lastIdx = new AtomicInteger (-1 );
239+ CodeUtility .forEach (instr , start , end - start , (opc , instrs , idx ) -> {
240+ lastIdx .set (idx );
241+ });
242+ return lastIdx .get ();
243+ }
244+
245+
246+ public static int findContainsIfIndex (List <JumpConditionInfo > loops , int index ) {
247+ for (int i = 0 , size = loops .size (); i < size ; i ++) {
248+ if (loops .get (i ).potentialIfIndex == index ) {
249+ return i ;
250+ }
251+ }
252+ return -1 ;
253+ }
254+
255+
256+ public static int findOpcIndex (List <JumpConditionInfo > loops , int index ) {
257+ for (int i = 0 , size = loops .size (); i < size ; i ++) {
258+ if (loops .get (i ).opcIdx == index ) {
259+ return i ;
260+ }
261+ }
262+ return -1 ;
263+ }
264+
265+
266+ public static boolean containsIndex (JumpConditionInfo cond , int index ) {
267+ var condTarget = cond .opcIdx + cond .targetOffset ;
268+ // avoid branch logic (ternary statements such as Math.min/max)
269+ return (index >= cond .opcIdx && index <= condTarget ) || (index >= condTarget && index <= cond .opcIdx );
270+ }
271+
272+
273+ /**
274+ * Check that an {@code ifCond}'s lower bound (generally its opcode index) is within a loop condition's
275+ * bounds and that the {@code ifCond}'s upper bound (generally its target index) is the instruction immediately after
276+ * the loop end instruction.
277+ * ASSUMPTION: the {@code loopCond}'s opcode index is its upper bound (i.e. the loop ends with a backward jump instruction)
278+ * @param loopCond the loop condition
279+ * @param ifCond the other condition, could be a loop or if
280+ * @return true if the conditions described above hold, false if not
281+ */
282+ public static boolean containsIfAndEndsWith (JumpConditionInfo loopCond , JumpConditionInfo ifCond ) {
283+ return loopCond .getTargetIndex () <= ifCond .getLowerIndex () &&
284+ // require the match to be a condition that jumps to the instruction after the loop
285+ loopCond .getOpcodeIndex () + loopCond .opc .getOperandCount () + 1 == ifCond .getUpperIndex ();
55286 }
56287
57288
@@ -66,18 +297,18 @@ public static int maxIndex(IntListReadOnly codeFlow) {
66297 }
67298
68299
69- public static String flowPathToString (byte [] instr , IntListReadOnly codeFlow ) {
300+ public static String flowPathToString (byte [] code , IntListReadOnly codeFlow ) {
70301 var sb = new StringBuilder ();
71302 for (int i = 0 , size = codeFlow .size (); i < size ; i ++) {
72303 var idx = codeFlow .get (i );
73304 // a conditional/jump point
74305 if (idx < 0 ) {
75- var opc = Opcodes .get (instr [~idx ] & 0xFF );
306+ var opc = Opcodes .get (code [~idx ]);
76307 sb .append (~idx ).append (' ' ).append (opc ).append (" -> " );
77308 }
78309 // a terminal point
79310 else {
80- var opc = Opcodes .get (instr [idx ] & 0xFF );
311+ var opc = Opcodes .get (code [idx ]);
81312 sb .append (idx ).append (' ' ).append (opc ).append ("], " );
82313 }
83314 }
0 commit comments