Add decoder of LLVM block-stream

2026-07-20 22:41:36 +00:00 · 2019-10-18 20:57:35 +01:00
parent a9a094bcdd
commit 2a1baae8a9
2 changed files with 351 additions and 0 deletions
@@ -26,6 +26,311 @@

 namespace LLVMBC
 {
+// How a single abbreviation parameter is encoded. All values except Literal are read directly
+// from a 3-bit field in the stream when an abbreviation is defined.
+enum class AbbrevEncoding : uint8_t
+{
+  // fixed-width field, the width is stored in AbbrevParam::value
+  Fixed = 1,
+  // variable bit-rate field, the width is stored in AbbrevParam::value
+  VBR = 2,
+  // array: a length is read from the stream, then each element is decoded using the
+  // parameter that follows this one
+  Array = 3,
+  // a value read with the bit reader's c6()
+  Char6 = 4,
+  // a blob read with the bit reader's ReadBlob()
+  Blob = 5,
+  // the abbrev encoding is only 3 bits, so 8 is not representable, we can store whether or not
+  // we're a literal this way.
+  Literal = 8,
+};
+
+// One parameter of an abbreviation definition: how it is encoded plus an encoding-specific
+// value. The value is only filled in for Literal, Fixed and VBR parameters.
+struct AbbrevParam
+{
+  AbbrevEncoding encoding;
+  // for Literal this is the literal value itself
+  uint64_t value;    // this is also the bitwidth for Fixed/VBR
+};
+
+// A complete abbreviation definition. When used to decode a record the first parameter gives
+// the record ID and the remaining parameters give the operands.
+struct AbbrevDesc
+{
+  rdcarray<AbbrevParam> params;
+};
+
+// the temporary context while pushing/popping blocks
+// the temporary context while pushing/popping blocks
+struct BlockContext
+{
+  // size is the width in bits of abbreviation IDs inside the block. The default of 2 is the
+  // same width the reader uses when no block has been entered yet.
+  // explicit so that a plain integer is never silently converted into a context.
+  explicit BlockContext(size_t size = 2) : abbrevSize(size) {}
+  // width in bits of each abbreviation ID read while this block is the innermost one
+  size_t abbrevSize;
+  // abbreviations defined locally inside this block, discarded when the block is popped
+  rdcarray<AbbrevDesc> abbrevs;
+};
+
+// the permanent block info defined by BLOCKINFO
+// the permanent block info defined by BLOCKINFO
+struct BlockInfo
+{
+  // block and record names are not stored since the reader skips those records
+  // rdcstr blockname;
+  // rdcarray<rdcstr> recordnames;
+
+  // abbreviations that apply to every block with the associated ID. These are assigned
+  // abbreviation IDs before any defined locally inside the block.
+  rdcarray<AbbrevDesc> abbrevs;
+};
+
+// The abbreviation IDs with a built-in meaning. Kept as a plain enum because the values are
+// compared directly against the integer IDs read from the stream.
+enum AbbrevId
+{
+  // ends the current block, the stream is then aligned to 32 bits
+  END_BLOCK = 0,
+  // begins a nested block
+  ENTER_SUBBLOCK = 1,
+  // defines a new abbreviation
+  DEFINE_ABBREV = 2,
+  // a record whose ID, operand count and operands are all stored as 6-bit VBR values
+  UNABBREV_RECORD = 3,
+  // the first ID that refers to a defined abbreviation, later ones follow sequentially
+  APPLICATION_ABBREV = 4,
+};
+
+// Record IDs that are interpreted when they appear inside the BLOCKINFO block (block 0).
+enum class BlockInfoRecord
+{
+  // selects which block ID the following abbreviation definitions apply to
+  SETBID = 1,
+  // read but otherwise ignored by the reader
+  BLOCKNAME = 2,
+  // read but otherwise ignored by the reader
+  SETRECORDNAME = 3,
+};
+
+// Wraps the given bytes in a bit reader and consumes the leading 32-bit magic number, which
+// is expected to be the 'BC' 0xC0DE signature.
+BitcodeReader::BitcodeReader(const byte *bitcode, size_t length) : b(bitcode, length)
+{
+  // the signature is the very first dword of the stream
+  const uint32_t magic = b.Read<uint32_t>();
+
+  RDCASSERT(magic == MAKE_FOURCC('B', 'C', 0xC0, 0xDE));
+}
+
+// Frees every BlockInfo that was allocated while reading BLOCKINFO blocks.
+BitcodeReader::~BitcodeReader()
+{
+  for(auto &entry : blockInfo)
+    delete entry.second;
+}
+
+// Reads one complete block, including everything nested inside it, from the current stream
+// position and returns it.
+BlockOrRecord BitcodeReader::ReadToplevelBlock()
+{
+  // a top-level block is expected to begin with ENTER_SUBBLOCK
+  const uint32_t abbrevID = b.fixed<uint32_t>(abbrevSize());
+  RDCASSERT(abbrevID == ENTER_SUBBLOCK);
+
+  BlockOrRecord ret;
+  ReadBlockContents(ret);
+  return ret;
+}
+
+// Returns whether the underlying bit reader reports that it is at the end of its data.
+bool BitcodeReader::AtEndOfStream()
+{
+  return b.AtEndOfStream();
+}
+
+// Reads the body of a block whose ENTER_SUBBLOCK abbreviation ID has already been consumed by
+// the caller. Fills in 'block' with the block's ID, its length in dwords and every child
+// block and record, reading nested blocks recursively.
+//
+// Abbreviations defined inside BLOCKINFO (block 0) after a SETBID record are stored
+// permanently for the selected block ID. All other abbreviations are local to the block being
+// read and are discarded when it ends.
+//
+// Malformed abbreviations or records are reported and skipped instead of indexing out of
+// bounds.
+void BitcodeReader::ReadBlockContents(BlockOrRecord &block)
+{
+  block.id = b.vbr<uint32_t>(8);
+
+  // the block header specifies the width of abbreviation IDs used inside the block
+  blockStack.push_back(new BlockContext(b.vbr<size_t>(4)));
+
+  b.align32bits();
+  block.blockDwordLength = b.Read<uint32_t>();
+
+  // used for blockinfo only
+  BlockInfo *curBlockInfo = NULL;
+
+  uint32_t abbrevID = ~0U;
+  do
+  {
+    abbrevID = b.fixed<uint32_t>(abbrevSize());
+
+    if(abbrevID == END_BLOCK)
+    {
+      b.align32bits();
+    }
+    else if(abbrevID == ENTER_SUBBLOCK)
+    {
+      BlockOrRecord sub;
+
+      ReadBlockContents(sub);
+
+      block.children.push_back(sub);
+    }
+    else if(abbrevID == DEFINE_ABBREV)
+    {
+      AbbrevDesc a;
+
+      uint32_t numops = b.vbr<uint32_t>(5);
+
+      a.params.resize(numops);
+
+      for(uint32_t i = 0; i < numops; i++)
+      {
+        AbbrevParam &param = a.params[i];
+
+        bool lit = b.fixed<bool>(1);
+
+        if(lit)
+        {
+          param.encoding = AbbrevEncoding::Literal;
+          param.value = b.vbr<uint64_t>(8);
+        }
+        else
+        {
+          param.encoding = b.fixed<AbbrevEncoding>(3);
+
+          // only these two encodings carry an extra value, which is their bit width
+          if(param.encoding == AbbrevEncoding::Fixed || param.encoding == AbbrevEncoding::VBR)
+          {
+            param.value = b.vbr<uint64_t>(5);
+          }
+        }
+      }
+
+      // inside BLOCKINFO after a SETBID the abbreviation is permanent for that block ID,
+      // otherwise it belongs to the block currently being read
+      if(curBlockInfo)
+        curBlockInfo->abbrevs.push_back(a);
+      else
+        blockStack.back()->abbrevs.push_back(a);
+    }
+    else if(abbrevID == UNABBREV_RECORD)
+    {
+      BlockOrRecord r;
+      r.id = b.vbr<uint32_t>(6);
+      uint32_t numops = b.vbr<uint32_t>(6);
+      r.ops.resize(numops);
+      for(uint32_t i = 0; i < numops; i++)
+        r.ops[i] = b.vbr<uint64_t>(6);
+
+      if(block.id == 0)    // BLOCKINFO is block 0
+      {
+        switch(BlockInfoRecord(r.id))
+        {
+          case BlockInfoRecord::SETBID:
+          {
+            // SETBID needs an operand giving the block ID, don't index an empty array
+            if(r.ops.empty())
+            {
+              RDCERR("SETBID record has no block ID operand");
+              break;
+            }
+
+            // fetch the entry for this block ID, creating it the first time it is seen
+            BlockInfo *&slot = blockInfo[(uint32_t)r.ops[0]];
+            if(slot == NULL)
+              slot = new BlockInfo;
+            curBlockInfo = slot;
+            break;
+          }
+          case BlockInfoRecord::BLOCKNAME:
+          {
+            // skipped because this is so rarely used
+            /*
+            for(uint32_t i = 0; i < r.ops.size(); i++)
+              curBlockInfo->blockname.push_back((char)r.ops[i]);
+              */
+            break;
+          }
+          case BlockInfoRecord::SETRECORDNAME:
+          {
+            // skipped because this is so rarely used
+            /*
+            uint32_t record = (uint32_t)r.ops[0];
+            if(record >= curBlockInfo->recordnames.size())
+              curBlockInfo->recordnames.resize(record + 1);
+            r.ops.erase(r.ops.begin());
+            for(uint32_t i = 0; i < r.ops.size(); i++)
+              curBlockInfo->recordnames[record].push_back((char)r.ops[i]);
+              */
+            break;
+          }
+        }
+      }
+
+      block.children.push_back(r);
+    }
+    else
+    {
+      const AbbrevDesc &a = getAbbrev(block.id, abbrevID);
+
+      BlockOrRecord r;
+
+      // should have at least one param for the code itself
+      RDCASSERT(!a.params.empty());
+
+      // without any params there is nothing to decode, and both params[0] and the
+      // size() - 1 below would be invalid. Skip the record and carry on with the block.
+      if(a.params.empty())
+      {
+        RDCERR("Abbreviated record uses an abbreviation with no parameters");
+        continue;
+      }
+
+      r.id = (uint32_t)decodeAbbrevParam(a.params[0]);
+
+      // process the rest of the operands - since some might be arrays we don't know until we
+      // process it how many ops the record will end up with but it will be at least one per
+      // parameter.
+      r.ops.reserve(a.params.size() - 1);
+      for(size_t i = 1; i < a.params.size(); i++)
+      {
+        const AbbrevParam &param = a.params[i];
+
+        if(param.encoding == AbbrevEncoding::Array)
+        {
+          // must be another param to specify the value type, and it must be the last
+          RDCASSERT(i + 1 == a.params.size() - 1);
+
+          // if the element type is missing entirely there is nothing valid to index
+          if(i + 1 >= a.params.size())
+          {
+            RDCERR("Array abbreviation parameter is missing its element type");
+            break;
+          }
+
+          const AbbrevParam &elType = a.params[i + 1];
+
+          size_t arrayLen = b.vbr<size_t>(6);
+
+          for(size_t el = 0; el < arrayLen; el++)
+            r.ops.push_back(decodeAbbrevParam(elType));
+
+          break;
+        }
+        else if(param.encoding == AbbrevEncoding::Blob)
+        {
+          // blob must be the last value
+          RDCASSERT(i == a.params.size() - 1);
+          b.ReadBlob(r.blob, r.blobLength);
+
+          break;
+        }
+        else
+        {
+          r.ops.push_back(decodeAbbrevParam(param));
+        }
+      }
+
+      block.children.push_back(r);
+    }
+  } while(abbrevID != END_BLOCK);
+
+  // this block is finished, so drop its context along with any local abbreviations
+  delete blockStack.back();
+  blockStack.erase(blockStack.size() - 1);
+}
+
+// Decodes a single scalar value from the stream as described by an abbreviation parameter.
+// Literals consume no bits and return the stored value. Array and Blob parameters can't be
+// decoded here, they are reported as an error and 0 is returned.
+uint64_t BitcodeReader::decodeAbbrevParam(const AbbrevParam &param)
+{
+  RDCASSERT(param.encoding != AbbrevEncoding::Array && param.encoding != AbbrevEncoding::Blob);
+
+  const AbbrevEncoding enc = param.encoding;
+
+  if(enc == AbbrevEncoding::Fixed)
+    return b.fixed<uint64_t>(param.value);
+
+  if(enc == AbbrevEncoding::VBR)
+    return b.vbr<uint64_t>(param.value);
+
+  if(enc == AbbrevEncoding::Char6)
+    return b.c6();
+
+  if(enc == AbbrevEncoding::Literal)
+    return param.value;
+
+  if(enc == AbbrevEncoding::Array || enc == AbbrevEncoding::Blob)
+  {
+    RDCERR("Array and blob types must be decoded specially");
+  }
+
+  // anything that couldn't be decoded yields 0
+  return 0;
+}
+
+// Returns the width in bits of abbreviation IDs at the current nesting level: the innermost
+// block's width, or 2 when no block has been entered yet.
+size_t BitcodeReader::abbrevSize() const
+{
+  return blockStack.empty() ? 2 : blockStack.back()->abbrevSize;
+}
+
+// Looks up the definition for an application-defined abbreviation ID used inside a block.
+//
+// blockId is the ID of the block being read, used to find any abbreviations registered for it
+// via BLOCKINFO. abbrevID is the raw ID read from the stream, expected to be at least
+// APPLICATION_ABBREV.
+//
+// The returned reference points into either the permanent BLOCKINFO storage or the innermost
+// block context.
+const AbbrevDesc &BitcodeReader::getAbbrev(uint32_t blockId, uint32_t abbrevID)
+{
+  // a block need not have any BLOCKINFO registered for it. Look the entry up without
+  // inserting into the map, and never dereference a missing or NULL entry.
+  auto it = blockInfo.find(blockId);
+  const BlockInfo *info = (it != blockInfo.end()) ? it->second : NULL;
+
+  // IDs start at the first application specified ID. Rebase to that to get 0-base indices
+  RDCASSERT(abbrevID >= APPLICATION_ABBREV);
+  abbrevID -= APPLICATION_ABBREV;
+
+  if(info)
+  {
+    // IDs are first assigned to those permanently from BLOCKINFO
+    if(abbrevID < info->abbrevs.size())
+      return info->abbrevs[abbrevID];
+
+    // block-local IDs start after the BLOCKINFO ones
+    abbrevID -= (uint32_t)info->abbrevs.size();
+  }
+
+  RDCASSERT(!blockStack.empty());
+  RDCASSERT(abbrevID < blockStack.back()->abbrevs.size());
+
+  return blockStack.back()->abbrevs[abbrevID];
+}
+
+// Builds a string from the record's operands, treating each operand from startOffset onwards
+// as one character. Returns an empty string if startOffset is at or beyond the last operand.
+rdcstr BlockOrRecord::getString(size_t startOffset) const
+{
+  rdcstr ret;
+
+  // nothing to convert - return early so the unsigned subtraction below can't wrap around
+  // into an enormous length
+  if(startOffset >= ops.size())
+    return ret;
+
+  ret.resize(ops.size() - startOffset);
+  for(size_t i = 0; i < ret.size(); i++)
+    ret[i] = (char)ops[i + startOffset];
+  return ret;
+}
+
 };    // namespace LLVMBC

 #if ENABLED(ENABLE_UNIT_TESTS)
@@ -24,8 +24,54 @@

 #pragma once

+#include <map>
 #include "llvm_bitreader.h"

 namespace LLVMBC
 {
+// A node in the decoded stream: either a block containing child nodes, or a record containing
+// operands and optionally a blob.
+struct BlockOrRecord
+{
+  // the block ID or record ID. Initialised like the other members so that a default
+  // constructed node never holds an indeterminate value; this is only a placeholder until
+  // the reader fills it in.
+  uint32_t id = ~0U;
+  uint32_t blockDwordLength = 0;    // 0 for records
+
+  bool IsBlock() const { return blockDwordLength > 0; }
+  bool IsRecord() const { return blockDwordLength == 0; }
+  // if a block, the child blocks/records
+  rdcarray<BlockOrRecord> children;
+
+  // converts the operands from startOffset onwards into a string, one character per operand
+  rdcstr getString(size_t startOffset = 0) const;
+
+  // if a record, the ops
+  rdcarray<uint64_t> ops;
+  // if this is an abbreviated record with a blob, this is the last operand
+  // this points into the overall byte storage, so the lifetime is limited.
+  const byte *blob = NULL;
+  size_t blobLength = 0;
+};
+
+struct AbbrevParam;
+struct AbbrevDesc;
+struct BlockContext;
+struct BlockInfo;
+
+// Decodes a bitcode byte stream into a tree of blocks and records.
+class BitcodeReader
+{
+public:
+  // wraps the given bytes and consumes the leading magic number
+  BitcodeReader(const byte *bitcode, size_t length);
+  ~BitcodeReader();
+
+  // the reader holds raw BlockInfo pointers that the destructor deletes, so a memberwise
+  // copy would end up deleting them twice. Disallow copying.
+  BitcodeReader(const BitcodeReader &) = delete;
+  BitcodeReader &operator=(const BitcodeReader &) = delete;
+
+  // reads the next block at the current position together with everything nested inside it
+  BlockOrRecord ReadToplevelBlock();
+  // whether the underlying bit reader is at the end of its data
+  bool AtEndOfStream();
+
+private:
+  BitReader b;
+
+  void ReadBlockContents(BlockOrRecord &block);
+  const AbbrevDesc &getAbbrev(uint32_t blockId, uint32_t abbrevID);
+  size_t abbrevSize() const;
+  uint64_t decodeAbbrevParam(const AbbrevParam &param);
+
+  // one context per block currently being read, innermost last
+  rdcarray<BlockContext *> blockStack;
+  // permanent per-block-ID info gathered from BLOCKINFO, owned by the reader
+  std::map<uint32_t, BlockInfo *> blockInfo;
+};
+
 };    // namespace LLVMBC