LLVM  8.0.1
ARMParallelDSP.cpp
Go to the documentation of this file.
1 //===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// Armv6 introduced instructions to perform 32-bit SIMD operations. The
12 /// purpose of this pass is do some IR pattern matching to create ACLE
13 /// DSP intrinsics, which map on these 32-bit SIMD operations.
14 /// This pass runs only when unaligned accesses is supported/enabled.
15 //
16 //===----------------------------------------------------------------------===//
17 
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/SmallPtrSet.h"
22 #include "llvm/Analysis/LoopPass.h"
23 #include "llvm/Analysis/LoopInfo.h"
24 #include "llvm/IR/Instructions.h"
25 #include "llvm/IR/NoFolder.h"
26 #include "llvm/Transforms/Scalar.h"
29 #include "llvm/Pass.h"
30 #include "llvm/PassRegistry.h"
31 #include "llvm/PassSupport.h"
32 #include "llvm/Support/Debug.h"
33 #include "llvm/IR/PatternMatch.h"
35 #include "ARM.h"
36 #include "ARMSubtarget.h"
37 
38 using namespace llvm;
39 using namespace PatternMatch;
40 
41 #define DEBUG_TYPE "arm-parallel-dsp"
42 
43 STATISTIC(NumSMLAD , "Number of smlad instructions generated");
44 
45 static cl::opt<bool>
46 DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
47  cl::desc("Disable the ARM Parallel DSP pass"));
48 
49 namespace {
50  struct OpChain;
51  struct BinOpChain;
52  struct Reduction;
53 
54  using OpChainList = SmallVector<std::unique_ptr<OpChain>, 8>;
55  using ReductionList = SmallVector<Reduction, 8>;
56  using ValueList = SmallVector<Value*, 8>;
57  using MemInstList = SmallVector<Instruction*, 8>;
58  using PMACPair = std::pair<BinOpChain*,BinOpChain*>;
59  using PMACPairList = SmallVector<PMACPair, 8>;
60  using Instructions = SmallVector<Instruction*,16>;
61  using MemLocList = SmallVector<MemoryLocation, 4>;
62 
63  struct OpChain {
64  Instruction *Root;
65  ValueList AllValues;
66  MemInstList VecLd; // List of all load instructions.
67  MemLocList MemLocs; // All memory locations read by this tree.
68  bool ReadOnly = true;
69 
70  OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
71  virtual ~OpChain() = default;
72 
73  void SetMemoryLocations() {
74  const auto Size = LocationSize::unknown();
75  for (auto *V : AllValues) {
76  if (auto *I = dyn_cast<Instruction>(V)) {
77  if (I->mayWriteToMemory())
78  ReadOnly = false;
79  if (auto *Ld = dyn_cast<LoadInst>(V))
80  MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
81  }
82  }
83  }
84 
85  unsigned size() const { return AllValues.size(); }
86  };
87 
88  // 'BinOpChain' and 'Reduction' are just some bookkeeping data structures.
89  // 'Reduction' contains the phi-node and accumulator statement from where we
90  // start pattern matching, and 'BinOpChain' the multiplication
91  // instructions that are candidates for parallel execution.
92  struct BinOpChain : public OpChain {
93  ValueList LHS; // List of all (narrow) left hand operands.
94  ValueList RHS; // List of all (narrow) right hand operands.
95  bool Exchange = false;
96 
97  BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
98  OpChain(I, lhs), LHS(lhs), RHS(rhs) {
99  for (auto *V : RHS)
100  AllValues.push_back(V);
101  }
102 
103  bool AreSymmetrical(BinOpChain *Other);
104  };
105 
106  struct Reduction {
107  PHINode *Phi; // The Phi-node from where we start
108  // pattern matching.
109  Instruction *AccIntAdd; // The accumulating integer add statement,
110  // i.e, the reduction statement.
111  OpChainList MACCandidates; // The MAC candidates associated with
112  // this reduction statement.
113  PMACPairList PMACPairs;
114  Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
115  };
116 
117  class ARMParallelDSP : public LoopPass {
118  ScalarEvolution *SE;
119  AliasAnalysis *AA;
120  TargetLibraryInfo *TLI;
121  DominatorTree *DT;
122  LoopInfo *LI;
123  Loop *L;
124  const DataLayout *DL;
125  Module *M;
126  std::map<LoadInst*, LoadInst*> LoadPairs;
127  std::map<LoadInst*, SmallVector<LoadInst*, 4>> SequentialLoads;
128 
129  bool RecordSequentialLoads(BasicBlock *Header);
130  bool InsertParallelMACs(Reduction &Reduction);
131  bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
132  void CreateParallelMACPairs(Reduction &R);
133  Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
134  Instruction *Acc, bool Exchange,
135  Instruction *InsertAfter);
136 
137  /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
138  /// Dual performs two signed 16x16-bit multiplications. It adds the
139  /// products to a 32-bit accumulate operand. Optionally, the instruction can
140  /// exchange the halfwords of the second operand before performing the
141  /// arithmetic.
142  bool MatchSMLAD(Function &F);
143 
144  public:
145  static char ID;
146 
147  ARMParallelDSP() : LoopPass(ID) { }
148 
149  void getAnalysisUsage(AnalysisUsage &AU) const override {
159  AU.setPreservesCFG();
160  }
161 
162  bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
163  if (DisableParallelDSP)
164  return false;
165  L = TheLoop;
166  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
167  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
168  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
169  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
170  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
171  auto &TPC = getAnalysis<TargetPassConfig>();
172 
173  BasicBlock *Header = TheLoop->getHeader();
174  if (!Header)
175  return false;
176 
177  // TODO: We assume the loop header and latch to be the same block.
178  // This is not a fundamental restriction, but lifting this would just
179  // require more work to do the transformation and then patch up the CFG.
180  if (Header != TheLoop->getLoopLatch()) {
181  LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
182  "running pass ARMParallelDSP\n");
183  return false;
184  }
185 
186  Function &F = *Header->getParent();
187  M = F.getParent();
188  DL = &M->getDataLayout();
189 
190  auto &TM = TPC.getTM<TargetMachine>();
191  auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
192 
193  if (!ST->allowsUnalignedMem()) {
194  LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
195  "running pass ARMParallelDSP\n");
196  return false;
197  }
198 
199  if (!ST->hasDSP()) {
200  LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
201  "ARMParallelDSP\n");
202  return false;
203  }
204 
205  LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
206  bool Changes = false;
207 
208  LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
209  LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
210 
211  if (!RecordSequentialLoads(Header)) {
212  LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
213  return false;
214  }
215 
216  Changes = MatchSMLAD(F);
217  return Changes;
218  }
219  };
220 }
221 
222 // MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
223 // instructions, which is set to 16. So here we should collect all i8 and i16
224 // narrow operations.
225 // TODO: we currently only collect i16, and will support i8 later, so that's
226 // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
227 template<unsigned MaxBitWidth>
228 static bool IsNarrowSequence(Value *V, ValueList &VL) {
229  LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
230  ConstantInt *CInt;
231 
232  if (match(V, m_ConstantInt(CInt))) {
233  // TODO: if a constant is used, it needs to fit within the bit width.
234  return false;
235  }
236 
237  auto *I = dyn_cast<Instruction>(V);
238  if (!I)
239  return false;
240 
241  Value *Val, *LHS, *RHS;
242  if (match(V, m_Trunc(m_Value(Val)))) {
243  if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
244  return IsNarrowSequence<MaxBitWidth>(Val, VL);
245  } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
246  // TODO: we need to implement sadd16/sadd8 for this, which enables to
247  // also do the rewrite for smlad8.ll, but it is unsupported for now.
248  LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
249  return false;
250  } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
251  if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
252  LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
253  cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
254  return false;
255  }
256 
257  if (match(Val, m_Load(m_Value()))) {
258  LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
259  VL.push_back(Val);
260  VL.push_back(I);
261  return true;
262  }
263  }
264  LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
265  return false;
266 }
267 
268 template<typename MemInst>
269 static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
270  const DataLayout &DL, ScalarEvolution &SE) {
271  if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
272  LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
273  return false;
274  }
275  if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
276  LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
277  return true;
278  }
279  LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n");
280  return false;
281 }
282 
283 bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
284  MemInstList &VecMem) {
285  if (!Ld0 || !Ld1)
286  return false;
287 
288  LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
289  dbgs() << "Ld0:"; Ld0->dump();
290  dbgs() << "Ld1:"; Ld1->dump();
291  );
292 
293  if (!Ld0->hasOneUse() || !Ld1->hasOneUse()) {
294  LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
295  return false;
296  }
297 
298  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
299  return false;
300 
301  VecMem.clear();
302  VecMem.push_back(Ld0);
303  VecMem.push_back(Ld1);
304  return true;
305 }
306 
307 /// Iterate through the block and record base, offset pairs of loads as well as
308 /// maximal sequences of sequential loads.
309 bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) {
311  for (auto &I : *Header) {
312  auto *Ld = dyn_cast<LoadInst>(&I);
313  if (!Ld)
314  continue;
315  Loads.push_back(Ld);
316  }
317 
318  std::map<LoadInst*, LoadInst*> BaseLoads;
319 
320  for (auto *Ld0 : Loads) {
321  for (auto *Ld1 : Loads) {
322  if (Ld0 == Ld1)
323  continue;
324 
325  if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
326  LoadPairs[Ld0] = Ld1;
327  if (BaseLoads.count(Ld0)) {
328  LoadInst *Base = BaseLoads[Ld0];
329  BaseLoads[Ld1] = Base;
330  SequentialLoads[Base].push_back(Ld1);
331  } else {
332  BaseLoads[Ld1] = Ld0;
333  SequentialLoads[Ld0].push_back(Ld1);
334  }
335  }
336  }
337  }
338  return LoadPairs.size() > 1;
339 }
340 
341 void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) {
342  OpChainList &Candidates = R.MACCandidates;
343  PMACPairList &PMACPairs = R.PMACPairs;
344  const unsigned Elems = Candidates.size();
345 
346  if (Elems < 2)
347  return;
348 
349  auto CanPair = [&](BinOpChain *PMul0, BinOpChain *PMul1) {
350  if (!PMul0->AreSymmetrical(PMul1))
351  return false;
352 
353  // The first elements of each vector should be loads with sexts. If we
354  // find that its two pairs of consecutive loads, then these can be
355  // transformed into two wider loads and the users can be replaced with
356  // DSP intrinsics.
357  for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) {
358  auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS[x]);
359  auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS[x]);
360  auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS[x]);
361  auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS[x]);
362 
363  if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
364  return false;
365 
366  LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
367  << "\t Ld0: " << *Ld0 << "\n"
368  << "\t Ld1: " << *Ld1 << "\n"
369  << "and operands " << x + 2 << ":\n"
370  << "\t Ld2: " << *Ld2 << "\n"
371  << "\t Ld3: " << *Ld3 << "\n");
372 
373  if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
374  if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
375  LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
376  PMACPairs.push_back(std::make_pair(PMul0, PMul1));
377  return true;
378  } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
379  LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
380  LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
381  PMul1->Exchange = true;
382  PMACPairs.push_back(std::make_pair(PMul0, PMul1));
383  return true;
384  }
385  } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
386  AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
387  LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
388  LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
389  LLVM_DEBUG(dbgs() << " and swapping muls\n");
390  PMul0->Exchange = true;
391  // Only the second operand can be exchanged, so swap the muls.
392  PMACPairs.push_back(std::make_pair(PMul1, PMul0));
393  return true;
394  }
395  }
396  return false;
397  };
398 
400  for (unsigned i = 0; i < Elems; ++i) {
401  BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get());
402  if (Paired.count(PMul0->Root))
403  continue;
404 
405  for (unsigned j = 0; j < Elems; ++j) {
406  if (i == j)
407  continue;
408 
409  BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[j].get());
410  if (Paired.count(PMul1->Root))
411  continue;
412 
413  const Instruction *Mul0 = PMul0->Root;
414  const Instruction *Mul1 = PMul1->Root;
415  if (Mul0 == Mul1)
416  continue;
417 
418  assert(PMul0 != PMul1 && "expected different chains");
419 
420  LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
421  dbgs() << "- "; Mul0->dump();
422  dbgs() << "- "; Mul1->dump());
423 
424  LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
425  if (CanPair(PMul0, PMul1)) {
426  Paired.insert(Mul0);
427  Paired.insert(Mul1);
428  break;
429  }
430  }
431  }
432 }
433 
434 bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) {
435  Instruction *Acc = Reduction.Phi;
436  Instruction *InsertAfter = Reduction.AccIntAdd;
437 
438  for (auto &Pair : Reduction.PMACPairs) {
439  BinOpChain *PMul0 = Pair.first;
440  BinOpChain *PMul1 = Pair.second;
441  LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
442  dbgs() << "- "; PMul0->Root->dump();
443  dbgs() << "- "; PMul1->Root->dump());
444 
445  auto *VecLd0 = cast<LoadInst>(PMul0->VecLd[0]);
446  auto *VecLd1 = cast<LoadInst>(PMul1->VecLd[0]);
447  Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, PMul1->Exchange, InsertAfter);
448  InsertAfter = Acc;
449  }
450 
451  if (Acc != Reduction.Phi) {
452  LLVM_DEBUG(dbgs() << "Replace Accumulate: "; Acc->dump());
453  Reduction.AccIntAdd->replaceAllUsesWith(Acc);
454  return true;
455  }
456  return false;
457 }
458 
459 static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
460  ReductionList &Reductions) {
461  RecurrenceDescriptor RecDesc;
462  const bool HasFnNoNaNAttr =
463  F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
464  const BasicBlock *Latch = TheLoop->getLoopLatch();
465 
466  // We need a preheader as getIncomingValueForBlock assumes there is one.
467  if (!TheLoop->getLoopPreheader()) {
468  LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
469  return;
470  }
471 
472  for (PHINode &Phi : Header->phis()) {
473  const auto *Ty = Phi.getType();
474  if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
475  continue;
476 
477  const bool IsReduction =
480  TheLoop, HasFnNoNaNAttr, RecDesc);
481  if (!IsReduction)
482  continue;
483 
484  Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
485  if (!Acc)
486  continue;
487 
488  Reductions.push_back(Reduction(&Phi, Acc));
489  }
490 
491  LLVM_DEBUG(
492  dbgs() << "\nAccumulating integer additions (reductions) found:\n";
493  for (auto &R : Reductions) {
494  dbgs() << "- "; R.Phi->dump();
495  dbgs() << "-> "; R.AccIntAdd->dump();
496  }
497  );
498 }
499 
500 static void AddMACCandidate(OpChainList &Candidates,
501  Instruction *Mul,
502  Value *MulOp0, Value *MulOp1) {
503  LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
504  assert(Mul->getOpcode() == Instruction::Mul &&
505  "expected mul instruction");
506  ValueList LHS;
507  ValueList RHS;
508  if (IsNarrowSequence<16>(MulOp0, LHS) &&
509  IsNarrowSequence<16>(MulOp1, RHS)) {
510  LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
511  Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
512  }
513 }
514 
515 static void MatchParallelMACSequences(Reduction &R,
516  OpChainList &Candidates) {
517  Instruction *Acc = R.AccIntAdd;
518  LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc);
519 
520  // Returns false to signal the search should be stopped.
521  std::function<bool(Value*)> Match =
522  [&Candidates, &Match](Value *V) -> bool {
523 
524  auto *I = dyn_cast<Instruction>(V);
525  if (!I)
526  return false;
527 
528  switch (I->getOpcode()) {
529  case Instruction::Add:
530  if (Match(I->getOperand(0)) || (Match(I->getOperand(1))))
531  return true;
532  break;
533  case Instruction::Mul: {
534  Value *MulOp0 = I->getOperand(0);
535  Value *MulOp1 = I->getOperand(1);
536  if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1))
537  AddMACCandidate(Candidates, I, MulOp0, MulOp1);
538  return false;
539  }
540  case Instruction::SExt:
541  return Match(I->getOperand(0));
542  }
543  return false;
544  };
545 
546  while (Match (Acc));
547  LLVM_DEBUG(dbgs() << "Finished matching MAC sequences, found "
548  << Candidates.size() << " candidates.\n");
549 }
550 
551 // Collects all instructions that are not part of the MAC chains, which is the
552 // set of instructions that can potentially alias with the MAC operands.
553 static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
554  Instructions &Writes) {
555  for (auto &I : *Header) {
556  if (I.mayReadFromMemory())
557  Reads.push_back(&I);
558  if (I.mayWriteToMemory())
559  Writes.push_back(&I);
560  }
561 }
562 
563 // Check whether statements in the basic block that write to memory alias with
564 // the memory locations accessed by the MAC-chains.
565 // TODO: we need the read statements when we accept more complicated chains.
566 static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
567  Instructions &Writes, OpChainList &MACCandidates) {
568  LLVM_DEBUG(dbgs() << "Alias checks:\n");
569  for (auto &MAC : MACCandidates) {
570  LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
571 
572  // At the moment, we allow only simple chains that only consist of reads,
573  // accumulate their result with an integer add, and thus that don't write
574  // memory, and simply bail if they do.
575  if (!MAC->ReadOnly)
576  return true;
577 
578  // Now for all writes in the basic block, check that they don't alias with
579  // the memory locations accessed by our MAC-chain:
580  for (auto *I : Writes) {
581  LLVM_DEBUG(dbgs() << "- "; I->dump());
582  assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
583  for (auto &MemLoc : MAC->MemLocs) {
584  if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
585  ModRefInfo::ModRef))) {
586  LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
587  return true;
588  }
589  }
590  }
591  }
592 
593  LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
594  return false;
595 }
596 
597 static bool CheckMACMemory(OpChainList &Candidates) {
598  for (auto &C : Candidates) {
599  // A mul has 2 operands, and a narrow op consist of sext and a load; thus
600  // we expect at least 4 items in this operand value list.
601  if (C->size() < 4) {
602  LLVM_DEBUG(dbgs() << "Operand list too short.\n");
603  return false;
604  }
605  C->SetMemoryLocations();
606  ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
607  ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
608 
609  // Use +=2 to skip over the expected extend instructions.
610  for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
611  if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
612  return false;
613  }
614  }
615  return true;
616 }
617 
618 // Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
619 // multiplications.
620 // To use SMLAD:
621 // 1) we first need to find integer add reduction PHIs,
622 // 2) then from the PHI, look for this pattern:
623 //
624 // acc0 = phi i32 [0, %entry], [%acc1, %loop.body]
625 // ld0 = load i16
626 // sext0 = sext i16 %ld0 to i32
627 // ld1 = load i16
628 // sext1 = sext i16 %ld1 to i32
629 // mul0 = mul %sext0, %sext1
630 // ld2 = load i16
631 // sext2 = sext i16 %ld2 to i32
632 // ld3 = load i16
633 // sext3 = sext i16 %ld3 to i32
634 // mul1 = mul i32 %sext2, %sext3
635 // add0 = add i32 %mul0, %acc0
636 // acc1 = add i32 %add0, %mul1
637 //
638 // Which can be selected to:
639 //
640 // ldr.h r0
641 // ldr.h r1
642 // smlad r2, r0, r1, r2
643 //
644 // If constants are used instead of loads, these will need to be hoisted
645 // out and into a register.
646 //
647 // If loop invariants are used instead of loads, these need to be packed
648 // before the loop begins.
649 //
650 bool ARMParallelDSP::MatchSMLAD(Function &F) {
651  BasicBlock *Header = L->getHeader();
652  LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
653  dbgs() << "Header block:\n"; Header->dump();
654  dbgs() << "Loop info:\n\n"; L->dump());
655 
656  bool Changed = false;
657  ReductionList Reductions;
658  MatchReductions(F, L, Header, Reductions);
659 
660  for (auto &R : Reductions) {
661  OpChainList MACCandidates;
662  MatchParallelMACSequences(R, MACCandidates);
663  if (!CheckMACMemory(MACCandidates))
664  continue;
665 
666  R.MACCandidates = std::move(MACCandidates);
667 
668  LLVM_DEBUG(dbgs() << "MAC candidates:\n";
669  for (auto &M : R.MACCandidates)
670  M->Root->dump();
671  dbgs() << "\n";);
672  }
673 
674  // Collect all instructions that may read or write memory. Our alias
675  // analysis checks bail out if any of these instructions aliases with an
676  // instruction from the MAC-chain.
677  Instructions Reads, Writes;
678  AliasCandidates(Header, Reads, Writes);
679 
680  for (auto &R : Reductions) {
681  if (AreAliased(AA, Reads, Writes, R.MACCandidates))
682  return false;
683  CreateParallelMACPairs(R);
684  Changed |= InsertParallelMACs(R);
685  }
686 
687  LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
688  return Changed;
689 }
690 
692  const Type *LoadTy) {
693  const unsigned AddrSpace = BaseLoad.getPointerAddressSpace();
694 
695  Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(),
696  LoadTy->getPointerTo(AddrSpace));
697  return IRB.CreateAlignedLoad(VecPtr, BaseLoad.getAlignment());
698 }
699 
700 Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
701  Instruction *Acc, bool Exchange,
702  Instruction *InsertAfter) {
703  LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n"
704  << "- " << *VecLd0 << "\n"
705  << "- " << *VecLd1 << "\n"
706  << "- " << *Acc << "\n"
707  << "Exchange: " << Exchange << "\n");
708 
709  IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
710  ++BasicBlock::iterator(InsertAfter));
711 
712  // Replace the reduction chain with an intrinsic call
713  const Type *Ty = IntegerType::get(M->getContext(), 32);
714  LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
715  LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
716  Value* Args[] = { NewLd0, NewLd1, Acc };
717  Function *SMLAD = nullptr;
718  if (Exchange)
719  SMLAD = Acc->getType()->isIntegerTy(32) ?
722  else
723  SMLAD = Acc->getType()->isIntegerTy(32) ?
726  CallInst *Call = Builder.CreateCall(SMLAD, Args);
727  NumSMLAD++;
728  return Call;
729 }
730 
731 // Compare the value lists in Other to this chain.
732 bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
733  // Element-by-element comparison of Value lists returning true if they are
734  // instructions with the same opcode or constants with the same value.
735  auto CompareValueList = [](const ValueList &VL0,
736  const ValueList &VL1) {
737  if (VL0.size() != VL1.size()) {
738  LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
739  << VL0.size() << " != " << VL1.size() << "\n");
740  return false;
741  }
742 
743  const unsigned Pairs = VL0.size();
744  LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
745 
746  for (unsigned i = 0; i < Pairs; ++i) {
747  const Value *V0 = VL0[i];
748  const Value *V1 = VL1[i];
749  const auto *Inst0 = dyn_cast<Instruction>(V0);
750  const auto *Inst1 = dyn_cast<Instruction>(V1);
751 
752  LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
753  dbgs() << "mul1: "; V0->dump();
754  dbgs() << "mul2: "; V1->dump());
755 
756  if (!Inst0 || !Inst1)
757  return false;
758 
759  if (Inst0->isSameOperationAs(Inst1)) {
760  LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
761  continue;
762  }
763 
764  const APInt *C0, *C1;
765  if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
766  return false;
767  }
768 
769  LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
770  return true;
771  };
772 
773  return CompareValueList(LHS, Other->LHS) &&
774  CompareValueList(RHS, Other->RHS);
775 }
776 
778  return new ARMParallelDSP();
779 }
780 
781 char ARMParallelDSP::ID = 0;
782 
783 INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
784  "Transform loops to use DSP intrinsics", false, false)
785 INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
786  "Transform loops to use DSP intrinsics", false, false)
The access may reference and may modify the value stored in memory.
Pass interface - Implemented by all &#39;passes&#39;.
Definition: Pass.h:81
static LoadInst * CreateLoadIns(IRBuilder< NoFolder > &IRB, LoadInst &BaseLoad, const Type *LoadTy)
uint64_t CallInst * C
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:71
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:225
This class represents lattice values for constants.
Definition: AllocatorList.h:24
static cl::opt< bool > DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false), cl::desc("Disable the ARM Parallel DSP pass"))
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:65
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, unsigned Align, const char *Name)
Provided to resolve &#39;CreateAlignedLoad(Ptr, Align, "...")&#39; correctly, instead of converting the strin...
Definition: IRBuilder.h:1393
static constexpr LocationSize unknown()
arm parallel dsp
static bool AddReductionVar(PHINode *Phi, RecurrenceKind Kind, Loop *TheLoop, bool HasFunNoNaNAttr, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr)
Returns true if Phi is a reduction of type Kind and adds it to the RecurrenceDescriptor.
static bool AreAliased(AliasAnalysis *AA, Instructions &Reads, Instructions &Writes, OpChainList &MACCandidates)
The main scalar evolution driver.
This class represents a function call, abstracting a target machine&#39;s calling convention.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:174
static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1, const DataLayout &DL, ScalarEvolution &SE)
An immutable pass that tracks lazily created AssumptionCache objects.
bool mayWriteToMemory() const
Return true if this instruction may modify memory.
STATISTIC(NumFunctions, "Total number of functions")
F(f)
An instruction for reading from memory.
Definition: Instructions.h:168
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp", "Transform loops to use DSP intrinsics", false, false) INITIALIZE_PASS_END(ARMParallelDSP
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:4298
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:48
AnalysisUsage & addRequired()
const DataLayout & getDataLayout() const
Get the data layout for the module&#39;s target platform.
Definition: Module.cpp:371
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:244
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:652
CastClass_match< OpTy, Instruction::Trunc > m_Trunc(const OpTy &Op)
Matches Trunc.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:197
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:743
arm parallel Transform loops to use DSP intrinsics
Target-Independent Code Generator Pass Configuration Options.
ELFYAML::ELF_STO Other
Definition: ELFYAML.cpp:784
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:642
BlockT * getHeader() const
Definition: LoopInfo.h:100
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1732
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:92
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:82
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:126
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:145
Function * getDeclaration(Module *M, ID id, ArrayRef< Type *> Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1020
Value * getOperand(unsigned i) const
Definition: User.h:170
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
#define P(N)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:423
Pass * createARMParallelDSPPass()
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt...
Definition: PatternMatch.h:176
void dump() const
Definition: LoopInfo.cpp:401
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
static bool IsNarrowSequence(Value *V, ValueList &VL)
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
static void AliasCandidates(BasicBlock *Header, Instructions &Reads, Instructions &Writes)
void dump() const
Dump the module to stderr (for debugging).
Definition: AsmWriter.cpp:4306
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
Represent the analysis usage information of a pass.
match_combine_or< CastClass_match< OpTy, Instruction::ZExt >, CastClass_match< OpTy, Instruction::SExt > > m_ZExtOrSExt(const OpTy &Op)
Value * getPointerOperand()
Definition: Instructions.h:285
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:63
Representation for a specific memory location.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:240
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:418
This is the shared class of boolean and integer constants.
Definition: Constants.h:84
auto size(R &&Range, typename std::enable_if< std::is_same< typename std::iterator_traits< decltype(Range.begin())>::iterator_category, std::random_access_iterator_tag >::value, void >::type *=nullptr) -> decltype(std::distance(Range.begin(), Range.end()))
Get the size of a range.
Definition: STLExtras.h:1167
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
Provides information about what library functions are available for the current target.
Drive the analysis of memory accesses in the loop.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:286
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:133
Class for arbitrary precision integers.
Definition: APInt.h:70
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
loop Loop Strength Reduction
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:241
StringRef getValueAsString() const
Return the attribute&#39;s value as a string.
Definition: Attributes.cpp:195
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:465
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
LLVM_NODISCARD ModRefInfo intersectModRef(const ModRefInfo MRI1, const ModRefInfo MRI2)
#define I(x, y, z)
Definition: MD5.cpp:58
bool mayReadFromMemory() const
Return true if this instruction may read memory.
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
uint32_t Size
Definition: Profile.cpp:47
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:325
pgo instr use
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:291
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:566
LLVM Value Representation.
Definition: Value.h:73
static void AddMACCandidate(OpChainList &Candidates, Instruction *Mul, Value *MulOp0, Value *MulOp1)
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.h:331
static void MatchParallelMACSequences(Reduction &R, OpChainList &Candidates)
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:59
The legacy pass manager&#39;s analysis pass to compute loop information.
Definition: LoopInfo.h:970
bool hasOneUse() const
Return true if there is exactly one user of this value.
Definition: Value.h:413
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:260
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object...
ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
static bool CheckMACMemory(OpChainList &Candidates)
#define LLVM_DEBUG(X)
Definition: Debug.h:123
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:274
loops
Definition: LoopInfo.cpp:772
const BasicBlock * getParent() const
Definition: Instruction.h:67
static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header, ReductionList &Reductions)