simple-ssd

created : Wed, 10 Jun 2020 19:39:41 +0900
modified : Wed, 26 Aug 2020 20:54:02 +0900
ssd

What I learned

Download


Reading the documentation

Sketching the architecture as a diagram


Host Interface Layer

Host Controller

Host Interface
class DMAInterface {
 public:
  DMAInterface() {}
  virtual ~DMAInterface() {}

  // Arguments: address, length in bytes, data buffer, completion callback,
  // and an optional callback context
  virtual void dmaRead(uint64_t, uint64_t, uint8_t *, DMAFunction &,
                       void * = nullptr) = 0;
  virtual void dmaWrite(uint64_t, uint64_t, uint8_t *, DMAFunction &,
                        void * = nullptr) = 0;
};
class Interface : public SimpleSSD::DMAInterface {
 protected:
  Controller *pController;

 public:
  virtual void updateInterrupt(uint16_t, bool) = 0;     // Interrupt vector, post/clear
  virtual void getVendorID(uint16_t &, uint16_t &) = 0; // VID and SSVID
};
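
For orientation, a minimal concrete implementation of these two abstract classes might look like the sketch below. This is not code from the SimpleSSD tree: the class name DummyInterface and the vendor ID values are made up, and DMAFunction is assumed to be SimpleSSD's usual callback type, a std::function taking a tick and a context pointer.

// Hypothetical host interface built on the Interface class above: it
// completes every DMA immediately and reports fixed vendor IDs.
class DummyInterface : public Interface {
 public:
  DummyInterface(Controller *pCtrl) { pController = pCtrl; }

  void dmaRead(uint64_t, uint64_t, uint8_t *, DMAFunction &func,
               void *context = nullptr) override {
    func(0, context);  // No modeled latency: complete at tick 0
  }
  void dmaWrite(uint64_t, uint64_t, uint8_t *, DMAFunction &func,
                void *context = nullptr) override {
    func(0, context);
  }
  void updateInterrupt(uint16_t, bool) override {
    // A real interface would post or clear an MSI/MSI-X vector here
  }
  void getVendorID(uint16_t &vid, uint16_t &ssvid) override {
    vid = 0x1234;  // Arbitrary example values
    ssvid = 0x1234;
  }
};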
Controller and Firmware
Controller
Subsystem
Namespace

Serial AT Attachment

Host interface
Host Bus Adapter
Device

Universal Flash Storage

Host Interface
Host Controller Interface
Device

Comparison with the nvme source code

Program entry point

A BIO is issued -> driver

Separating the opcode from the request
SimpleSSD implementation
void Driver::submitIO(BIL::BIO &bio) {
  uint32_t cmd[16];
  PRP *prp = nullptr;
  static ResponseHandler callback = [this](uint16_t status, uint32_t,
                                           void *context) {
    _io(status, context);
  };

  memset(cmd, 0, 64);

  uint64_t slba = bio.offset / LBAsize;
  uint32_t nlb = (uint32_t)DIVCEIL(bio.length, LBAsize);

  cmd[1] = namespaceID;  // NSID

  if (bio.type == BIL::BIO_READ) {
    cmd[0] = SimpleSSD::HIL::NVMe::OPCODE_READ;  // CID, FUSE, OPC
    cmd[10] = (uint32_t)slba;
    cmd[11] = slba >> 32;
    cmd[12] = nlb - 1;  // LR, FUA, PRINFO, NLB

    prp = new PRP(bio.length);
    prp->getPointer(*(uint64_t *)(cmd + 6), *(uint64_t *)(cmd + 8));  // DPTR
  }
  else if (bio.type == BIL::BIO_WRITE) {
    cmd[0] = SimpleSSD::HIL::NVMe::OPCODE_WRITE;  // CID, FUSE, OPC
    cmd[10] = (uint32_t)slba;
    cmd[11] = slba >> 32;
    cmd[12] = nlb - 1;  // LR, FUA, PRINFO, DTYPE, NLB

    prp = new PRP(bio.length);
    prp->getPointer(*(uint64_t *)(cmd + 6), *(uint64_t *)(cmd + 8));  // DPTR
  }
  else if (bio.type == BIL::BIO_FLUSH) {
    cmd[0] = SimpleSSD::HIL::NVMe::OPCODE_FLUSH;  // CID, FUSE, OPC
  }
  else if (bio.type == BIL::BIO_TRIM) {
    cmd[0] = SimpleSSD::HIL::NVMe::OPCODE_DATASET_MANAGEMEMT;  // CID, FUSE, OPC
    cmd[10] = 0;                                               // NR
    cmd[11] = 0x04;                                            // AD

    prp = new PRP(16);
    prp->getPointer(*(uint64_t *)(cmd + 6), *(uint64_t *)(cmd + 8));  // DPTR

    // Fill range definition
    uint8_t data[16];

    memset(data, 0, 16);
    memcpy(data + 4, &nlb, 4);
    memcpy(data + 8, &slba, 8);

    prp->writeData(0, 16, data);
  }

  submitCommand(1, (uint8_t *)cmd, callback,
                new IOWrapper(bio.id, prp, bio.callback));
}
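
As a usage sketch (not from the SimpleSSD tree): assuming BIL::BIO carries the fields submitIO reads above — id, type, offset, length, and a completion callback whose exact signature is an assumption here — a single 4 KB read could be issued like this. pDriver is a hypothetical pointer to the Driver.

// Hypothetical call into Driver::submitIO; field names follow the code above.
BIL::BIO bio;

bio.id = 1;
bio.type = BIL::BIO_READ;
bio.offset = 0;     // Byte offset; submitIO converts it to an SLBA
bio.length = 4096;  // Byte length; DIVCEIL rounds it up to whole LBAs
bio.callback = [](uint64_t id) { /* I/O with this ID finished */ };

pDriver->submitIO(bio);  // Builds the 64-byte command and rings the doorbell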
nvme (driver) implementation
blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
		struct nvme_command *cmd)
{
	blk_status_t ret = BLK_STS_OK;

	nvme_clear_nvme_request(req);

	memset(cmd, 0, sizeof(*cmd));
	switch (req_op(req)) {
	case REQ_OP_DRV_IN:
	case REQ_OP_DRV_OUT:
		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
		break;
	case REQ_OP_FLUSH:
		nvme_setup_flush(ns, cmd);
		break;
	case REQ_OP_WRITE_ZEROES:
		ret = nvme_setup_write_zeroes(ns, req, cmd);
		break;
	case REQ_OP_DISCARD:
		ret = nvme_setup_discard(ns, req, cmd);
		break;
	case REQ_OP_READ:
	case REQ_OP_WRITE:
		ret = nvme_setup_rw(ns, req, cmd);
		break;
	default:
		WARN_ON_ONCE(1);
		return BLK_STS_IOERR;
	}

	cmd->common.command_id = req->tag;
	trace_nvme_setup_cmd(req, cmd);
	return ret;
}
submit command
SimpleSSD implementation
void Driver::submitCommand(uint16_t iv, uint8_t *cmd, ResponseHandler &func,
                           void *context) {
  uint16_t cid = 0;
  uint16_t opcode = cmd[0];
  uint16_t tail = 0;
  uint64_t tick = engine.getCurrentTick();
  Queue *queue = nullptr;

  // Push to queue
  if (iv == 0) {
    increaseCommandID(adminCommandID);
    cid = adminCommandID;
    queue = adminSQ;
  }
  else if (iv == 1 && ioSQ) {
    increaseCommandID(ioCommandID);
    cid = ioCommandID;
    queue = ioSQ;
  }
  else {
    SimpleSSD::panic("I/O Submission Queue is not initialized");
  }

  memcpy(cmd + 2, &cid, 2);
  queue->setData(cmd, 64);
  tail = queue->getTail();

  // Push to pending cmd list
  pendingCommandList.push_back(CommandEntry(iv, opcode, cid, context, func));

  // Ring doorbell
  pController->ringSQTailDoorbell(iv, tail, tick);
  queue->incrHead();
}
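
The counterpart to pendingCommandList is completion handling: the driver must match each completion entry back to a pending command by its queue and CID, whereas the kernel simply reuses the block layer tag (cmd->common.command_id = req->tag in nvme_setup_cmd above). A rough sketch of that matching, assuming CommandEntry exposes the fields it was constructed with, could be:

// Hypothetical completion dispatch, not the actual SimpleSSD code: find the
// pending entry with the same interrupt vector and command ID, invoke its
// ResponseHandler, and remove it from the list.
void Driver::handleCompletion(uint16_t iv, uint16_t cid, uint16_t status) {
  for (auto iter = pendingCommandList.begin();
       iter != pendingCommandList.end(); ++iter) {
    if (iter->iv == iv && iter->cid == cid) {
      iter->func(status, 0, iter->context);  // Same shape as in submitIO

      pendingCommandList.erase(iter);

      return;
    }
  }

  SimpleSSD::panic("Completion for unknown command");
}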
nvme implementation
/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 * @write_sq: whether to write to the SQ doorbell
 */
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
			    bool write_sq)
{
	spin_lock(&nvmeq->sq_lock);
	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
	       cmd, sizeof(*cmd));
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	if (write_sq)
		nvme_write_sq_db(nvmeq);
	spin_unlock(&nvmeq->sq_lock);
}

Work

SimpleSSD
nvme (driver)

SSD Interface

Internal Cache Layer

Abstract Class

Set-Associative Cache

Call structure

void Namespace::submitCommand(SQEntryWrapper &req, RequestFunction &func) {
    /* Skip */
    // NVM commands
    else {
      switch (req.entry.dword0.opcode) {
        case OPCODE_FLUSH:
          flush(req, func);
          break;
        case OPCODE_WRITE:
          write(req, func);
          break;
        case OPCODE_READ:
          read(req, func);
          break;
        case OPCODE_COMPARE:
          compare(req, func);
          break;
        case OPCODE_DATASET_MANAGEMEMT:
          datasetManagement(req, func);
          break;
        default:
          resp.makeStatus(true, false, TYPE_GENERIC_COMMAND_STATUS,
                          STATUS_INVALID_OPCODE);

          response = true;

          break;
      }
    }
  }

  if (response) {
    func(resp);
  }
}
void Subsystem::read(Namespace *ns, uint64_t slba, uint64_t nlblk,
                     DMAFunction &func, void *context) {
  Request *req = new Request(func, context);
  DMAFunction doRead = [this](uint64_t, void *context) {
    auto req = (Request *)context;

    pHIL->read(*req);

    delete req;
  };

  convertUnit(ns, slba, nlblk, *req);

  execute(CPU::NVME__SUBSYSTEM, CPU::CONVERT_UNIT, doRead, req);
}
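
The pattern in Subsystem::read recurs throughout SimpleSSD: the continuation is wrapped in a DMAFunction, a std::function taking the current tick and a context pointer, and execute() charges modeled CPU latency before running it. Below is a self-contained sketch of that idea, with a made-up fixed latency standing in for SimpleSSD's CPU model; executeSketch is not a real SimpleSSD function.

#include <cstdint>
#include <functional>

// The callback shape used across SimpleSSD: (tick, context).
using DMAFunction = std::function<void(uint64_t, void *)>;

// Hypothetical stand-in for execute(): advance the tick by a modeled
// firmware latency, then run the continuation at the later tick.
static void executeSketch(uint64_t &tick, DMAFunction &func, void *context) {
  const uint64_t latency = 100000;  // Made-up CPU latency

  tick += latency;
  func(tick, context);
}

int main() {
  uint64_t tick = 0;
  DMAFunction doRead = [](uint64_t when, void *) {
    // The real code would call pHIL->read(*req) here
    (void)when;
  };

  executeSketch(tick, doRead, nullptr);

  return 0;
}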

Studying the cache logic

/* If the return value is true it is a hit, otherwise a miss. */
bool GenericCache::read(Request &req, uint64_t &tick) {
  bool ret = false;

  debugprint(LOG_ICL_GENERIC_CACHE,
             "READ  | REQ %7u-%-4u | LCA %" PRIu64 " | SIZE %" PRIu64,
             req.reqID, req.reqSubID, req.range.slpn, req.length);

  if (useReadCaching) { /* Check whether read caching is enabled. */
    /* slpn: presumably short for "start logical page number". */
    uint32_t setIdx = calcSetIndex(req.range.slpn); /* See Set-Associative Cache above. */
    uint32_t wayIdx;
    uint64_t arrived = tick;

    /* Not sure what this does. Is it checking for consecutive requests?
     * Looking into it, this is related to prefetching; does it always assume the
     * predicted access pattern is sequential? A complicated algorithm for choosing
     * the region to prefetch would certainly cause problems, but is there a paper
     * saying that prefetching only sequential accesses is better?
     * TODO: read the literature
    */
    if (useReadPrefetch) {
      checkSequential(req, readDetect);
    }

    wayIdx = getValidWay(req.range.slpn, tick);

    // Do we have valid data?
    if (wayIdx != waySize) {
      uint64_t tickBackup = tick;

      // Wait cache to be valid
      if (tick < cacheData[setIdx][wayIdx].insertedAt) {
        tick = cacheData[setIdx][wayIdx].insertedAt;
      }

      // Update last accessed time
      cacheData[setIdx][wayIdx].lastAccessed = tick;

      // DRAM access
      pDRAM->read(&cacheData[setIdx][wayIdx], req.length, tick);

      debugprint(LOG_ICL_GENERIC_CACHE,
                 "READ  | Cache hit at (%u, %u) | %" PRIu64 " - %" PRIu64
                 " (%" PRIu64 ")",
                 setIdx, wayIdx, arrived, tick, tick - arrived);

      ret = true;

      // Do we need to prefetch data?
      if (useReadPrefetch && req.range.slpn == prefetchTrigger) {
        debugprint(LOG_ICL_GENERIC_CACHE, "READ  | Prefetch triggered");

        req.range.slpn = lastPrefetched;

        // Backup tick
        arrived = tick;
        tick = tickBackup;

        goto ICL_GENERIC_CACHE_READ;
      }
    }
    // We should read data from NVM
    else {
    ICL_GENERIC_CACHE_READ:
      FTL::Request reqInternal(lineCountInSuperPage, req);
      std::vector<std::pair<uint64_t, uint64_t>> readList;
      uint32_t row, col;  // Variable for I/O position (IOFlag)
      uint64_t dramAt;
      uint64_t beginLCA, endLCA;
      uint64_t beginAt, finishedAt = tick;

      if (readDetect.enabled) {
        // TEMP: Disable DRAM calculation for prevent conflict
        pDRAM->setScheduling(false);

        if (!ret) {
          debugprint(LOG_ICL_GENERIC_CACHE, "READ  | Read ahead triggered");
        }

        beginLCA = req.range.slpn;

        // If super-page is disabled, just read all pages from all planes
        if (prefetchMode == MODE_ALL || !bSuperPage) {
          endLCA = beginLCA + lineCountInMaxIO;
          prefetchTrigger = beginLCA + lineCountInMaxIO / 2;
        }
        else {
          endLCA = beginLCA + lineCountInSuperPage;
          prefetchTrigger = beginLCA + lineCountInSuperPage / 2;
        }

        lastPrefetched = endLCA;
      }
      else {
        beginLCA = req.range.slpn;
        endLCA = beginLCA + 1;
      }

      for (uint64_t lca = beginLCA; lca < endLCA; lca++) {
        beginAt = tick;

        // Check cache
        if (getValidWay(lca, beginAt) != waySize) {
          continue;
        }

        // Find way to write data read from NVM
        setIdx = calcSetIndex(lca);
        wayIdx = getEmptyWay(setIdx, beginAt);

        if (wayIdx == waySize) {
          wayIdx = evictFunction(setIdx, beginAt);

          if (cacheData[setIdx][wayIdx].dirty) {
            // We need to evict data before write
            calcIOPosition(cacheData[setIdx][wayIdx].tag, row, col);
            evictData[row][col] = cacheData[setIdx] + wayIdx;
          }
        }

        cacheData[setIdx][wayIdx].insertedAt = beginAt;
        cacheData[setIdx][wayIdx].lastAccessed = beginAt;
        cacheData[setIdx][wayIdx].valid = true;
        cacheData[setIdx][wayIdx].dirty = false;

        readList.push_back({lca, ((uint64_t)setIdx << 32) | wayIdx});

        finishedAt = MAX(finishedAt, beginAt);
      }

      tick = finishedAt;

      evictCache(tick);

      for (auto &iter : readList) {
        Line *pLine = &cacheData[iter.second >> 32][iter.second & 0xFFFFFFFF];

        // Read data
        reqInternal.lpn = iter.first / lineCountInSuperPage;
        reqInternal.ioFlag.reset();
        reqInternal.ioFlag.set(iter.first % lineCountInSuperPage);

        beginAt = tick;  // Ignore cache metadata access

        // If superPageSizeData is true, read first LPN only
        pFTL->read(reqInternal, beginAt);

        // DRAM delay
        dramAt = pLine->insertedAt;
        pDRAM->write(pLine, lineSize, dramAt);

        // Set cache data
        beginAt = MAX(beginAt, dramAt);

        pLine->insertedAt = beginAt;
        pLine->lastAccessed = beginAt;
        pLine->tag = iter.first;

        if (pLine->tag == req.range.slpn) {
          finishedAt = beginAt;
        }

        debugprint(LOG_ICL_GENERIC_CACHE,
                   "READ  | Cache miss at (%u, %u) | %" PRIu64 " - %" PRIu64
                   " (%" PRIu64 ")",
                   iter.second >> 32, iter.second & 0xFFFFFFFF, tick, beginAt,
                   beginAt - tick);
      }

      tick = finishedAt;

      if (readDetect.enabled) {
        if (ret) {
          // This request was prefetch
          debugprint(LOG_ICL_GENERIC_CACHE, "READ  | Prefetch done");

          // Restore tick
          tick = arrived;
        }
        else {
          debugprint(LOG_ICL_GENERIC_CACHE, "READ  | Read ahead done");
        }

        // TEMP: Restore
        pDRAM->setScheduling(true);
      }
    }

    tick += applyLatency(CPU::ICL__GENERIC_CACHE, CPU::READ);
  }
  else {
    FTL::Request reqInternal(lineCountInSuperPage, req);

    pDRAM->write(nullptr, req.length, tick);

    pFTL->read(reqInternal, tick);
  }

  stat.request[0]++;

  if (ret) {
    stat.cache[0]++;
  }

  return ret;
}
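
To make the set-associative lookup concrete: written from how they are used in read() above rather than copied from the source, calcSetIndex and getValidWay plausibly reduce to a modulo over the set count and a linear scan of one set's ways. The member names setSize, waySize, cacheData, and the Line fields follow the usage above; treat this as a sketch, not the actual implementation.

// Sketch, not the actual SimpleSSD code: map an LPN to a set index, then
// scan that set's ways for a valid line whose tag matches.
uint32_t GenericCache::calcSetIndex(uint64_t lpn) {
  return (uint32_t)(lpn % setSize);
}

uint32_t GenericCache::getValidWay(uint64_t lpn, uint64_t tick) {
  uint32_t setIdx = calcSetIndex(lpn);

  (void)tick;  // The real code likely also charges metadata access time here

  for (uint32_t wayIdx = 0; wayIdx < waySize; wayIdx++) {
    Line &line = cacheData[setIdx][wayIdx];

    if (line.valid && line.tag == lpn) {
      return wayIdx;  // Hit
    }
  }

  return waySize;  // Miss: callers compare the result against waySize
}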