Current location: Home > E-book analysis

E-book analysis

2022-06-12 05:54:00 【Snow flies fast】

Book Constructors

A Book object is created in one of two scenarios:

Parsed directly from an uploaded e-book file
Built from a data object

/**
 * Model of an e-book. An instance is built either from an uploaded file
 * or from a plain data object, depending on which argument is supplied.
 */
class Book {
  /**
   * @param {*} file - File descriptor; when truthy the book is created from it.
   * @param {*} data - Data object used only when `file` is falsy.
   */
  constructor(file, data) {
    if (!file) {
      this.createBookFromData(data)
      return
    }
    this.createBookFromFile(file)
  }

  /**
   * Placeholder: currently only logs the file it was given.
   * @param {*} file - File descriptor passed to the constructor.
   */
  createBookFromFile(file) {
    console.log('createBookFromFile', file)
  }

  /**
   * Placeholder: currently only logs the data it was given.
   * @param {*} data - Data object passed to the constructor.
   */
  createBookFromData(data) {
    console.log('createBookFromData', data)
  }
}

module.exports = Book

Creating a Book object from a file

Add the following three constants to utils\constant.js:

The MIME type may be `application/epub` or `application/epub+zip`, so the string `startsWith` method is used to check it.

const {
     env } = require('./env')
const UPLOAD_PATH =
  env === 'dev' ? 'E:/upload/admin-upload-ebook' : '/root/upload/admin-upload-ebook'

const UPLOAD_URL = env === 'dev' ? 'http://127.0.0.1:8089/admin-upload-ebook' : 'http://www.book.llmysnow.top/admin-upload-ebook'

module.exports = {
    
  // ...
  UPLOAD_PATH,
  MIME_TYPE_EPUB: 'application/epub',
  UPLOAD_URL
}

After reading the e-book from the uploaded file, initialize the Book object:

const {
     MIME_TYPE_EPUB, UPLOAD_URL, UPLOAD_PATH } = require('../utils/constant')
const fs = require('fs')

class Book {
  // ...

  /**
   * Initialise this book from an uploaded file descriptor.
   *
   * Creates the unzip directory under UPLOAD_PATH if it is missing, renames the
   * uploaded file so that it carries the `.epub` suffix (only when the source
   * exists and the target does not), and fills in the instance fields with
   * their paths, URLs and empty metadata defaults.
   *
   * @param {object} file - Upload descriptor (presumably produced by the upload
   *   middleware - confirm against the caller).
   * @param {string} file.destination - Local directory the file was stored in.
   * @param {string} file.filename - Stored file name.
   * @param {string} file.path - Full path of the stored file.
   * @param {string} [file.mimetype=MIME_TYPE_EPUB] - Resource MIME type.
   * @param {string} file.originalname - Original name of the uploaded file.
   */
  createBookFromFile(file) {
    const {
      destination: uploadDir,
      filename,
      path: uploadedPath,
      mimetype = MIME_TYPE_EPUB,
      originalname: originalName
    } = file

    // Both application/epub and application/epub+zip share this prefix.
    const isEpub = mimetype.startsWith(MIME_TYPE_EPUB)
    const storedName = isEpub ? `${filename}.epub` : `${filename}`

    const storedPath = `${uploadDir}/${storedName}` // where the e-book ends up on disk
    const relativeBookPath = `/book/${storedName}` // same file, relative to the upload root
    const relativeUnzipPath = `/unzip/${filename}` // extraction folder, relative to the upload root
    const unzipDir = `${UPLOAD_PATH}${relativeUnzipPath}`

    // Make sure the extraction folder exists (parents included).
    if (!fs.existsSync(unzipDir)) {
      fs.mkdirSync(unzipDir, { recursive: true })
    }

    // Give the uploaded file its suffix, but never overwrite an existing target.
    const shouldRename = fs.existsSync(uploadedPath) && !fs.existsSync(storedPath)
    if (shouldRename) {
      fs.renameSync(uploadedPath, storedPath)
    }

    Object.assign(this, {
      fileName: filename, // file name without suffix
      path: relativeBookPath, // epub file relative path
      filePath: relativeBookPath, // epub file relative path (same value as path)
      unzipPath: relativeUnzipPath, // relative path of the unzipped folder
      url: `${UPLOAD_URL}${relativeBookPath}`, // epub download link
      title: '', // title
      author: '', // author
      publisher: '', // publisher
      contents: [], // table of contents
      cover: '', // cover image URL
      coverPath: '', // cover image path
      category: -1, // category ID
      categoryText: '', // category name
      language: '', // language
      unzipUrl: `${UPLOAD_URL}${relativeUnzipPath}`, // link to the unzipped folder
      originalName // original name of the e-book file
    })
  }
}

E-book analysis

After initialization, call the `parse` method of the Book instance to parse the e-book.

Here we use the epub library. We copy its source file, epub.js, directly to utils\epub.js.

Install the two libraries that epub.js depends on:

npm i xml2js adm-zip

Parsing the e-book with the epub library

Add a `parse` method to the `Book` class in models\Book.js.

The data parsed from `epub.metadata` is assigned to `this`.

const Epub = require('../utils/epub')

class Book {
  // ....

  /**
   * Parse the e-book file referenced by `this.path`.
   *
   * Reads the metadata through the Epub parser, unzips the archive, parses the
   * table of contents and finally writes the cover image under
   * `${UPLOAD_PATH}/img`. On success the instance fields (title, language,
   * author, publisher, rootFile, contents, contentsTree, coverPath, cover) are
   * filled in.
   *
   * Every failure path rejects and then stops; no further work is done once
   * the promise has been rejected.
   *
   * @returns {Promise<Book>} Resolves with this instance once the cover has
   *   been written. Rejects when the e-book file is missing, the parser reports
   *   an error, the title is empty, the contents cannot be parsed, or the cover
   *   cannot be read or written.
   */
  parse() {
    return new Promise((resolve, reject) => {
      const bookPath = `${UPLOAD_PATH}${this.path}`
      if (!fs.existsSync(bookPath)) {
        // Double quotes: the message itself contains an apostrophe.
        reject(new Error(" E-books don't exist "))
        return
      }

      const epub = new Epub(bookPath)
      epub.on('error', err => reject(err))
      epub.on('end', err => {
        if (err) {
          reject(err)
          return
        }

        const { title, creator, creatorFileAs, language, publisher, cover } = epub.metadata
        if (!title) {
          reject(new Error(' Book title is empty '))
          return
        }
        this.title = title
        this.language = language || 'en'
        this.author = creator || creatorFileAs || 'unknown'
        this.publisher = publisher || 'unknown'
        this.rootFile = epub.rootFile

        // Callback for epub.getImage: persist the cover, then resolve.
        const handleGetImage = (imageErr, file, mimeType) => {
          if (imageErr) {
            reject(imageErr)
            return
          }
          // The image subtype (e.g. "jpeg" from "image/jpeg") is used as the file suffix.
          const suffix = mimeType && mimeType.split('/')[1]
          const coverFile = `/img/${this.fileName}.${suffix}`
          try {
            fs.writeFileSync(`${UPLOAD_PATH}${coverFile}`, file, 'binary')
          } catch (writeErr) {
            // The callback may run outside the promise chain, so reject explicitly.
            reject(writeErr)
            return
          }
          this.coverPath = coverFile
          this.cover = `${UPLOAD_URL}${coverFile}`
          resolve(this)
        }

        try {
          this.unzip() // unzip the e-book
          this.parseContents(epub)
            .then(({ chapters, chapterTree }) => {
              this.contents = chapters
              this.contentsTree = chapterTree
              epub.getImage(cover, handleGetImage) // fetch the cover image
            })
            .catch(reject)
        } catch (e) {
          // unzip() and parseContents() may throw synchronously.
          reject(e)
        }
      })
      epub.parse()
    })
  }
}

Getting the cover image with the epub library

For the format and contents of EPUB files, review the "ePub e-book" section of the earlier article "Technical structure of the project".

content.opf (the file may have a different name; container.xml points to the actual file, and in general you can simply look for the .opf file) has about five parts. One of them is the `manifest` (the file list), which contains the information about the cover.

<meta name="cover" content="cover-image"/>
<item id="cover-image" href="images/cover.jpg" media-type="image/jpeg"/>

<!-- In other books the cover is not declared through a meta tag; it is found through the item's properties attribute -->
<item id="Aimages_978-3-319-64337-3_CoverFigure" href="images/978-3-319-64337-3_CoverFigure.jpg" media-type="image/jpeg" properties="cover-image"/>

Modify the cover-image lookup in the utils\epub.js source code as follows.

If other formats turn up, this code can be extended later.

getImage(id, callback) {
    
  if (this.manifest[id]) {
    

    if ((this.manifest[id]['media-type'] || "").toLowerCase().trim().substr(0, 6) != "image/") {
    
      return callback(new Error("Invalid mime type for image"));
    }

    this.getFile(id, callback);
  } else {
    
    const coverId = Object.keys(this.manifest).find(key =>
      this.manifest[key].properties === 'cover-image'
    )
    if (coverId) {
    
      this.getFile(coverId, callback)
    } else {
    
      callback(new Error("File not found"));
    }
  }
};

E-book catalog analysis

While parsing the e-book we also need to parse its table of contents. The first step is to unzip the e-book.

class Book {
    
  //...
  unzip() {
    
    const AdmZip = require('adm-zip')
    const zip = new AdmZip(Book.genPath(this.path)) //  Parse file path 
    zip.extractAllTo(Book.genPath(this.unzipPath), true)
  }
}

genPath is a class-level method of Book, so it can be declared with the `static` keyword.

class Book {
  //...

  /**
   * Build an absolute path under UPLOAD_PATH from a relative one.
   *
   * @param {string} path - Relative path; a leading "/" is added when missing.
   * @returns {string} UPLOAD_PATH followed by the normalised relative path.
   */
  static genPath(path) {
    const relative = path.startsWith('/') ? path : `/${path}`
    return `${UPLOAD_PATH}${relative}`
  }
}

E-book parsing algorithm

First, get the `href` of `toc` from the epub instance's `spine` (if it is missing, look it up in the `manifest`), then read the NCX file at that path.
Because the NCX file is XML, use xml2js to convert it to JSON (the navigation map is then available as `json.ncx.navMap`).
The `epub.flow` array holds the display order of the e-book, but it does not necessarily match the actual table of contents, so it is better to build the contents from `json.ncx.navMap.navPoint`.
Finally, add some useful fields to each chapter and push it into the array. The result looks like this:

The table of contents is hierarchical: for example, chapter 1 contains section 1, and section 1 contains subsection 1. So we need a flatten method to turn it into a flat list. Following this idea we can hand-write the ES2019 (ES10) `flat` method, plus a `reduce` version (with the ternary operator it fits on a single line).

const arr = [1, [2, [3, [4, [5]], [6]], [7]]]
console.log(arr.flat(Infinity)) // [1, 2, 3, 4, 5, 6, 7]

/**
 * Recursively flatten nested arrays into a single new array (depth-first,
 * left to right), like `arr.flat(Infinity)`.
 *
 * Elements are appended one at a time rather than spread into a `concat`
 * call, so very large arrays do not hit the engine's argument-count limit.
 *
 * @param {Array} arr - Possibly nested array.
 * @returns {Array} New flat array; `arr` is not modified.
 */
function flatten(arr) {
  const flat = []
  const collect = items => {
    for (const item of items) {
      if (Array.isArray(item)) {
        collect(item)
      } else {
        flat.push(item)
      }
    }
  }
  collect(arr)
  return flat
}
console.log(flatten(arr)) // [1, 2, 3, 4, 5, 6, 7]

/**
 * Reduce-based flatten: folds `arr` into a new array, recursing into every
 * element that is itself an array.
 *
 * @param {Array} arr - Possibly nested array.
 * @returns {Array} New flat array.
 */
function flattenReduce(arr) {
  // Nested arrays are flattened first; concat then splices them in one level.
  const appendFlattened = (flat, entry) =>
    flat.concat(Array.isArray(entry) ? flattenReduce(entry) : entry)
  return arr.reduce(appendFlattened, [])
}
console.log(flattenReduce(arr)) // [1, 2, 3, 4, 5, 6, 7]

Process the table of contents and shape it into the data format expected by `el-tree`:

const path = require('path')
const xml2js = require('xml2js').parseString

class Book {
  //...

  /**
   * Parse the NCX table of contents of an unzipped e-book.
   *
   * Locates the NCX file through the epub's spine (falling back to the
   * manifest), converts it from XML with xml2js and produces both a flat,
   * ordered chapter list and a parent/child tree of the same chapter objects.
   *
   * Each chapter is the xml2js navPoint object extended with: level, pid, id,
   * href, text, label, navId, fileName, order and children.
   *
   * @param {object} epub - Parsed epub instance; `spine.toc` and `manifest` are read.
   * @returns {Promise<{chapters: object[], chapterTree: object[]}>} Rejects when
   *   the XML cannot be parsed or contains no navPoint entries.
   * @throws {Error} Synchronously, when the NCX file cannot be located on disk.
   */
  parseContents(epub) {
    // NCX location relative to the unzipped folder: spine.toc.href, else the
    // manifest entry named by spine.toc.id. Returns undefined when neither exists.
    function getNcxFilePath() {
      const toc = epub && epub.spine && epub.spine.toc
      if (!toc) return undefined
      if (toc.href) return toc.href
      const entry = epub.manifest && epub.manifest[toc.id]
      return entry ? entry.href : undefined
    }

    // With explicitArray: false, xml2js yields a bare object for a single child
    // and an array for several; normalise both (and "no children") to an array.
    function toArray(value) {
      if (!value) return []
      return Array.isArray(value) ? value : [value]
    }

    // Annotate every navPoint, at any depth, with its nesting level and parent id.
    function findParent(array, level = 0, pid = '') {
      return array.map(item => {
        item.level = level
        item.pid = pid
        const children = toArray(item.navPoint)
        if (children.length > 0) {
          // Children are annotated in place, so item.navPoint keeps its shape.
          findParent(children, level + 1, item['$'].id)
        }
        return item
      })
    }

    // Depth-first, parent-before-children flattening of the navPoint tree.
    function flatten(array) {
      const flat = []
      array.forEach(item => {
        flat.push(item)
        flatten(toArray(item.navPoint)).forEach(child => flat.push(child))
      })
      return flat
    }

    const ncxHref = getNcxFilePath()
    const ncxFilePath = Book.genPath(`${this.unzipPath}/${ncxHref}`) // NCX file path on disk
    if (!ncxHref || !fs.existsSync(ncxFilePath)) {
      throw new Error(' Catalog file does not exist ')
    }

    return new Promise((resolve, reject) => {
      const xml = fs.readFileSync(ncxFilePath, 'utf-8') // read the NCX file
      // Folder of the NCX file relative to the upload root; chapter sources are relative to it.
      const dir = path.dirname(ncxFilePath).replace(UPLOAD_PATH, '')
      const filename = this.fileName
      const unzipPath = this.unzipPath
      // Convert the NCX file from XML to JSON.
      xml2js(
        xml,
        {
          explicitArray: false, // single children are not wrapped in arrays
          ignoreAttrs: false // keep attributes (exposed under "$")
        },
        (err, json) => {
          if (err) {
            reject(err)
            return
          }
          try {
            const navMap = json && json.ncx && json.ncx.navMap
            const navPoints = toArray(navMap && navMap.navPoint)
            if (navPoints.length === 0) {
              reject(new Error(' Directory resolution failed , The number of directories is 0'))
              return
            }

            // Flat list of chapters in document order.
            const chapters = flatten(findParent(navPoints))
            chapters.forEach((chapter, index) => {
              const src = chapter.content['$'].src
              chapter.id = `${src}`
              chapter.href = `${dir}/${src}`.replace(unzipPath, '')
              chapter.text = `${UPLOAD_URL}${dir}/${src}`
              chapter.label = (chapter.navLabel && chapter.navLabel.text) || ''
              chapter.navId = chapter['$'].id
              chapter.fileName = filename
              chapter.order = index + 1
            })

            // Build the tree; parents always precede their children in the flat
            // list, so a map of already-seen chapters is enough to find them.
            const chapterTree = []
            const seenByNavId = new Map()
            chapters.forEach(chapter => {
              chapter.children = []
              const parent = chapter.pid === '' ? undefined : seenByNavId.get(chapter.pid)
              if (parent) {
                parent.children.push(chapter)
              } else {
                // Top-level chapter, or one whose parent id is unknown.
                chapterTree.push(chapter)
              }
              if (!seenByNavId.has(chapter.navId)) seenByNavId.set(chapter.navId, chapter)
            })
            resolve({ chapters, chapterTree })
          } catch (e) {
            // Malformed navPoint entries must reject rather than throw inside the parser callback.
            reject(e)
          }
        }
      )
    })
  }
}