Add: Scanner extracts cover from comic files #1837 and ComicInfo.xml parser

This commit is contained in:
advplyr 2024-01-14 17:51:26 -06:00
parent e76af3bfc2
commit f5545cd3f4
10 changed files with 762 additions and 4 deletions

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018 ნიკა
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,262 @@
/**
* Modified from https://github.com/nika-begiashvili/libarchivejs
*/
const Path = require('path')
const { Worker } = require('worker_threads')
/**
 * Represents a single file inside an archive prior to extraction.
 * Holds only entry metadata; the actual bytes are pulled lazily via extract().
 */
class CompressedFile {
  /**
   * @param {string} name - file name (basename of the entry)
   * @param {number} size - uncompressed size in bytes
   * @param {string} path - full entry path inside the archive
   * @param {Archive} archiveRef - owning archive used to perform extraction
   */
  constructor(name, size, path, archiveRef) {
    Object.assign(this, {
      _name: name,
      _size: size,
      _path: path,
      _archiveRef: archiveRef
    })
  }

  /** @returns {string} file name */
  get name() {
    return this._name
  }

  /** @returns {number} file size in bytes */
  get size() {
    return this._size
  }

  /**
   * Extract this file's data from the owning archive
   * @returns {Promise<File>} extracted file entry
   */
  extract() {
    return this._archiveRef.extractSingleFile(this._path)
  }
}
/**
 * Drives a worker thread (libarchiveWorker.js) that runs the libarchive WASM
 * module. Every operation is a request message posted to the worker; the
 * matching response handler is kept on a callback stack — see _postMessage()
 * and _workerMsg() for the pairing rules.
 */
class Archive {
  /**
   * Creates new archive instance from a raw file buffer
   * @param {Buffer} fileBuffer
   * @returns {Promise<Archive>} archive, resolved once the worker has opened it
   */
  static open(fileBuffer) {
    const arch = new Archive(fileBuffer, { workerUrl: Path.join(__dirname, 'libarchiveWorker.js') })
    return arch.open()
  }

  /**
   * Create new archive
   * @param {Buffer} file - raw archive file contents
   * @param {Object} options - must provide options.workerUrl (worker script path)
   */
  constructor(file, options) {
    this._worker = new Worker(options.workerUrl)
    this._worker.on('message', this._workerMsg.bind(this))
    this._callbacks = [] // stack of pending response handlers (most recent last)
    this._content = {} // nested object mirroring the archive directory tree
    this._processed = 0 // 0 = not listed yet, 1 = listed, 2 = fully extracted
    this._file = file
  }

  /**
   * Prepares file for reading
   * @returns {Promise<Archive>} archive instance
   */
  async open() {
    // Wait for the worker to finish loading the WASM module, then hand it the file
    await this._postMessage({ type: 'HELLO' }, (resolve, reject, msg) => {
      if (msg.type === 'READY') {
        resolve()
      }
    })
    return await this._postMessage({ type: 'OPEN', file: this._file }, (resolve, reject, msg) => {
      if (msg.type === 'OPENED') {
        resolve(this)
      }
    })
  }

  /**
   * Terminate worker to free up memory.
   * The archive instance is unusable afterwards.
   */
  close() {
    this._worker.terminate()
    this._worker = null
  }

  /**
   * detect if archive has encrypted data
   * @returns {Promise<boolean|null>} null if could not be determined
   */
  hasEncryptedData() {
    return this._postMessage({ type: 'CHECK_ENCRYPTION' },
      (resolve, reject, msg) => {
        if (msg.type === 'ENCRYPTION_STATUS') {
          resolve(msg.status)
        }
      }
    )
  }

  /**
   * set password to be used when reading archive
   * @param {string} archivePassword
   * @returns {Promise<boolean>} resolves true once the passphrase has been set
   */
  usePassword(archivePassword) {
    return this._postMessage({ type: 'SET_PASSPHRASE', passphrase: archivePassword },
      (resolve, reject, msg) => {
        if (msg.type === 'PASSPHRASE_STATUS') {
          resolve(msg.status)
        }
      }
    )
  }

  /**
   * Returns object containing directory structure and file information.
   * Result is cached after the first listing.
   * @returns {Promise<object>} tree with CompressedFile leaves keyed by path segment
   */
  getFilesObject() {
    if (this._processed > 0) {
      // Already listed (or extracted) — serve the cached tree
      return Promise.resolve().then(() => this._content)
    }
    return this._postMessage({ type: 'LIST_FILES' }, (resolve, reject, msg) => {
      if (msg.type === 'ENTRY') {
        const entry = msg.entry
        const [target, prop] = this._getProp(this._content, entry.path)
        if (entry.type === 'FILE') {
          target[prop] = new CompressedFile(entry.fileName, entry.size, entry.path, this)
        }
        // Returning true keeps this handler on the stack for the next ENTRY message
        return true
      } else if (msg.type === 'END') {
        this._processed = 1
        resolve(this._cloneContent(this._content))
      }
    })
  }

  /**
   * Flat list of every file in the archive
   * @returns {Promise<Array<{file: CompressedFile, path: string}>>}
   */
  getFilesArray() {
    return this.getFilesObject().then((obj) => {
      return this._objectToArray(obj)
    })
  }

  /**
   * Extract a single entry without extracting the rest of the archive
   * @param {string} target - entry path inside the archive
   * @returns {Promise<object>} entry object including its fileData buffer
   */
  extractSingleFile(target) {
    // Prevent extraction if worker already terminated
    if (this._worker === null) {
      throw new Error("Archive already closed")
    }
    return this._postMessage({ type: 'EXTRACT_SINGLE_FILE', target: target },
      (resolve, reject, msg) => {
        if (msg.type === 'FILE') {
          resolve(msg.entry)
        }
      }
    )
  }

  /**
   * Returns object containing directory structure and extracted File objects.
   * Terminates the worker when extraction completes.
   * @param {Function} extractCallback - optional, invoked once per extracted file
   * @returns {Promise<object>}
   */
  extractFiles(extractCallback) {
    if (this._processed > 1) {
      // Already fully extracted — serve the cached tree
      return Promise.resolve().then(() => this._content)
    }
    return this._postMessage({ type: 'EXTRACT_FILES' }, (resolve, reject, msg) => {
      if (msg.type === 'ENTRY') {
        const [target, prop] = this._getProp(this._content, msg.entry.path)
        if (msg.entry.type === 'FILE') {
          target[prop] = msg.entry
          if (extractCallback !== undefined) {
            // Notify asynchronously so the callback cannot block message handling
            setTimeout(extractCallback.bind(null, {
              file: target[prop],
              path: msg.entry.path,
            }))
          }
        }
        // Keep this handler on the stack for the next ENTRY message
        return true
      } else if (msg.type === 'END') {
        this._processed = 2
        this._worker.terminate()
        resolve(this._cloneContent(this._content))
      }
    })
  }

  // Deep-copies the content tree; CompressedFile leaves (and nulls) are shared, not cloned
  _cloneContent(obj) {
    if (obj instanceof CompressedFile || obj === null) return obj
    const o = {}
    for (const prop of Object.keys(obj)) {
      o[prop] = this._cloneContent(obj[prop])
    }
    return o
  }

  // Flattens the nested content tree into [{file, path}] records
  _objectToArray(obj, path = '') {
    const files = []
    for (const key of Object.keys(obj)) {
      if (obj[key] instanceof CompressedFile || obj[key] === null) {
        files.push({
          file: obj[key] || key,
          path: path
        })
      } else {
        files.push(...this._objectToArray(obj[key], `${path}${key}/`))
      }
    }
    return files
  }

  // Walks (creating as needed) the nested object along a '/'-separated path;
  // returns [parentObject, finalKey] so the caller can assign the leaf
  _getProp(obj, path) {
    const parts = path.split('/')
    if (parts[parts.length - 1] === '') parts.pop()
    let cur = obj, prev = null
    for (const part of parts) {
      cur[part] = cur[part] || {}
      prev = cur
      cur = cur[part]
    }
    return [prev, parts[parts.length - 1]]
  }

  // Posts a message to the worker and pushes a response handler onto the stack
  _postMessage(msg, callback) {
    this._worker.postMessage(msg)
    return new Promise((resolve, reject) => {
      this._callbacks.push(this._msgHandler.bind(this, callback, resolve, reject))
    })
  }

  // Normalizes worker responses: rejects on BUSY/ERROR, otherwise delegates to
  // the operation-specific handler (whose truthy return keeps it registered)
  _msgHandler(callback, resolve, reject, msg) {
    if (!msg) {
      reject('invalid msg')
      return
    }
    if (msg.type === 'BUSY') {
      reject('worker is busy')
    } else if (msg.type === 'ERROR') {
      reject(msg.error)
    } else {
      return callback(resolve, reject, msg)
    }
  }

  // Routes each worker message to the most recent pending handler; pops the
  // handler once it stops returning a truthy "keep me" value
  _workerMsg(msg) {
    const callback = this._callbacks[this._callbacks.length - 1]
    const next = callback(msg)
    if (!next) {
      this._callbacks.pop()
    }
  }
}
module.exports = Archive

View File

@ -0,0 +1,72 @@
/**
 * Modified from https://github.com/nika-begiashvili/libarchivejs
 *
 * Worker-thread side of the archive protocol: receives command messages from
 * the Archive class (archive.js) and answers with result messages. Only one
 * command is processed at a time; overlapping requests get a BUSY response.
 */
const { parentPort } = require('worker_threads')
const { getArchiveReader } = require('./wasm-module')

let reader = null // ArchiveReader instance, set once the WASM module has loaded
let busy = false // guards against overlapping commands

// Load the libarchive WASM module, then tell the parent we are ready
getArchiveReader((_reader) => {
  reader = _reader
  busy = false
  parentPort.postMessage({ type: 'READY' })
})

parentPort.on('message', async msg => {
  if (busy) {
    parentPort.postMessage({ type: 'BUSY' })
    return
  }

  let skipExtraction = false
  busy = true
  try {
    switch (msg.type) {
      case 'HELLO': // module will respond READY when it's ready
        break
      case 'OPEN':
        await reader.open(msg.file)
        parentPort.postMessage({ type: 'OPENED' })
        break
      case 'LIST_FILES':
        // LIST_FILES is EXTRACT_FILES with extraction skipped (deliberate fallthrough)
        skipExtraction = true
        // eslint-disable-next-line no-fallthrough
      case 'EXTRACT_FILES':
        // One ENTRY message per archive entry, terminated by END
        for (const entry of reader.entries(skipExtraction)) {
          parentPort.postMessage({ type: 'ENTRY', entry })
        }
        parentPort.postMessage({ type: 'END' })
        break
      case 'EXTRACT_SINGLE_FILE':
        // Walks all entries but only the target gets extracted (and reported)
        for (const entry of reader.entries(true, msg.target)) {
          if (entry.fileData) {
            parentPort.postMessage({ type: 'FILE', entry })
          }
        }
        break
      case 'CHECK_ENCRYPTION':
        parentPort.postMessage({ type: 'ENCRYPTION_STATUS', status: reader.hasEncryptedData() })
        break
      case 'SET_PASSPHRASE':
        reader.setPassphrase(msg.passphrase)
        parentPort.postMessage({ type: 'PASSPHRASE_STATUS', status: true })
        break
      default:
        throw new Error('Invalid Command')
    }
  } catch (err) {
    // Surface worker-side failures to the parent as a structured ERROR message
    // (Error instances cannot be posted directly, so copy the relevant fields)
    parentPort.postMessage({
      type: 'ERROR',
      error: {
        message: err.message,
        name: err.name,
        stack: err.stack
      }
    })
  } finally {
    // eslint-disable-next-line require-atomic-updates
    busy = false
  }
})

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,235 @@
/**
* Modified from https://github.com/nika-begiashvili/libarchivejs
*/
const Path = require('path')
const libarchive = require('./wasm-libarchive')
// Maps libarchive entry filetype codes (the AE_IF* mode bits listed in the
// comment block inside WasmModule.initFunctions, e.g. 32768 = 0100000 octal =
// AE_IFREG) to readable type names used throughout this module.
const TYPE_MAP = {
  32768: 'FILE',
  16384: 'DIR',
  40960: 'SYMBOLIC_LINK',
  49152: 'SOCKET',
  8192: 'CHARACTER_DEVICE',
  24576: 'BLOCK_DEVICE',
  4096: 'NAMED_PIPE',
}
/**
 * Reads archive contents through the libarchive WASM module. Owns a copy of
 * the archive bytes on the WASM heap (allocated in open(), freed in close()).
 */
class ArchiveReader {
  /**
   * archive reader
   * @param {WasmModule} wasmModule emscripten module
   */
  constructor(wasmModule) {
    this._wasmModule = wasmModule
    this._runCode = wasmModule.runCode // cwrap'd libarchive entry points (see WasmModule.initFunctions)
    this._file = null
    this._passphrase = null
  }

  /**
   * open archive, needs to closed manually.
   * Copies the file buffer into WASM heap memory.
   * @param {File} file
   * @returns {Promise<void>} resolves once the buffer is on the WASM heap
   */
  open(file) {
    if (this._file !== null) {
      console.warn('Closing previous file')
      this.close()
    }
    const { promise, resolve, reject } = this._promiseHandles()
    this._file = file
    this._loadFile(file, resolve, reject)
    return promise
  }

  /**
   * close archive: releases the libarchive handle and frees the heap
   * allocation made in open()
   */
  close() {
    this._runCode.closeArchive(this._archive)
    this._wasmModule._free(this._filePtr)
    this._file = null
    this._filePtr = null
    this._archive = null
  }

  /**
   * detect if archive has encrypted data
   * NOTE(review): opens a fresh archive handle without closing the previous
   * one (close() only releases the latest) — confirm repeated calls on the
   * same reader do not leak handles.
   * @returns {boolean|null} null if could not be determined
   */
  hasEncryptedData() {
    this._archive = this._runCode.openArchive(this._filePtr, this._fileLength, this._passphrase)
    this._runCode.getNextEntry(this._archive)
    const status = this._runCode.hasEncryptedEntries(this._archive)
    // libarchive contract: >0 encrypted entries found, 0 none found, <0 unknown
    if (status === 0) {
      return false
    } else if (status > 0) {
      return true
    } else {
      return null
    }
  }

  /**
   * set passphrase to be used with archive
   * @param {*} passphrase
   */
  setPassphrase(passphrase) {
    this._passphrase = passphrase
  }

  /**
   * get archive entries.
   * Re-opens the archive and walks every entry in order. When skipExtraction
   * is set, entry data is not decompressed (except for the `except` path).
   * @param {boolean} skipExtraction
   * @param {string} except don't skip this entry
   * @yields {object} entry data: {size, path, type, ref[, fileName][, fileData]}
   */
  *entries(skipExtraction = false, except = null) {
    this._archive = this._runCode.openArchive(this._filePtr, this._fileLength, this._passphrase)
    let entry
    while (true) {
      entry = this._runCode.getNextEntry(this._archive)
      if (entry === 0) break // null pointer — no more entries
      const entryData = {
        size: this._runCode.getEntrySize(entry),
        path: this._runCode.getEntryName(entry),
        type: TYPE_MAP[this._runCode.getEntryType(entry)],
        ref: entry,
      }
      if (entryData.type === 'FILE') {
        let fileName = entryData.path.split('/')
        entryData.fileName = fileName[fileName.length - 1]
      }
      if (skipExtraction && except !== entryData.path) {
        this._runCode.skipEntry(this._archive)
      } else {
        // Decompress the entry into WASM memory, copy the bytes out, free it
        const ptr = this._runCode.getFileData(this._archive, entryData.size)
        if (ptr < 0) {
          throw new Error(this._runCode.getError(this._archive))
        }
        entryData.fileData = this._wasmModule.HEAP8.slice(ptr, ptr + entryData.size)
        this._wasmModule._free(ptr)
      }
      yield entryData
    }
  }

  // Copies the file bytes into a freshly malloc'd region of the WASM heap;
  // records the pointer/length for later openArchive calls
  _loadFile(fileBuffer, resolve, reject) {
    try {
      const array = new Uint8Array(fileBuffer)
      this._fileLength = array.length
      this._filePtr = this._runCode.malloc(this._fileLength)
      this._wasmModule.HEAP8.set(array, this._filePtr)
      resolve()
    } catch (error) {
      reject(error)
    }
  }

  // Returns a promise together with its externalized resolve/reject functions
  _promiseHandles() {
    let resolve = null, reject = null
    const promise = new Promise((_resolve, _reject) => {
      resolve = _resolve
      reject = _reject
    })
    return { promise, resolve, reject }
  }
}
/**
 * Emscripten Module configuration object. An instance is passed to the
 * libarchive WASM factory, which mixes in the emscripten runtime (cwrap,
 * allocate, HEAP8, _free, ...) and invokes the hooks defined here.
 */
class WasmModule {
  constructor() {
    this.preRun = []
    this.postRun = []
    this.totalDependencies = 0
  }

  // stdout hook used by the emscripten runtime
  print(...text) {
    console.log(text)
  }

  // stderr hook used by the emscripten runtime
  printErr(...text) {
    console.error(text)
  }

  /**
   * Wraps the exported C functions with cwrap into this.runCode.
   * Must be called after the WASM module has finished loading
   * (cwrap is only available once emscripten has mixed in its runtime).
   */
  initFunctions() {
    this.runCode = {
      // const char * get_version()
      getVersion: this.cwrap('get_version', 'string', []),
      // void * archive_open( const void * buffer, size_t buffer_size)
      // returns archive pointer
      openArchive: this.cwrap('archive_open', 'number', ['number', 'number', 'string']),
      // void * get_entry(void * archive)
      // return archive entry pointer
      getNextEntry: this.cwrap('get_next_entry', 'number', ['number']),
      // void * get_filedata( void * archive, size_t bufferSize )
      getFileData: this.cwrap('get_filedata', 'number', ['number', 'number']),
      // int archive_read_data_skip(struct archive *_a)
      skipEntry: this.cwrap('archive_read_data_skip', 'number', ['number']),
      // void archive_close( void * archive )
      closeArchive: this.cwrap('archive_close', null, ['number']),
      // la_int64_t archive_entry_size( struct archive_entry * )
      getEntrySize: this.cwrap('archive_entry_size', 'number', ['number']),
      // const char * archive_entry_pathname( struct archive_entry * )
      getEntryName: this.cwrap('archive_entry_pathname', 'string', ['number']),
      // __LA_MODE_T archive_entry_filetype( struct archive_entry * )
      /*
      #define AE_IFMT ((__LA_MODE_T)0170000)
      #define AE_IFREG ((__LA_MODE_T)0100000) // Regular file
      #define AE_IFLNK ((__LA_MODE_T)0120000) // Symbolic link
      #define AE_IFSOCK ((__LA_MODE_T)0140000) // Socket
      #define AE_IFCHR ((__LA_MODE_T)0020000) // Character device
      #define AE_IFBLK ((__LA_MODE_T)0060000) // Block device
      #define AE_IFDIR ((__LA_MODE_T)0040000) // Directory
      #define AE_IFIFO ((__LA_MODE_T)0010000) // Named pipe
      */
      getEntryType: this.cwrap('archive_entry_filetype', 'number', ['number']),
      // const char * archive_error_string(struct archive *);
      getError: this.cwrap('archive_error_string', 'string', ['number']),
      /*
      * Returns 1 if the archive contains at least one encrypted entry.
      * If the archive format not support encryption at all
      * ARCHIVE_READ_FORMAT_ENCRYPTION_UNSUPPORTED is returned.
      * If for any other reason (e.g. not enough data read so far)
      * we cannot say whether there are encrypted entries, then
      * ARCHIVE_READ_FORMAT_ENCRYPTION_DONT_KNOW is returned.
      * In general, this function will return values below zero when the
      * reader is uncertain or totally incapable of encryption support.
      * When this function returns 0 you can be sure that the reader
      * supports encryption detection but no encrypted entries have
      * been found yet.
      *
      * NOTE: If the metadata/header of an archive is also encrypted, you
      * cannot rely on the number of encrypted entries. That is why this
      * function does not return the number of encrypted entries but
      * just shows that there are some.
      */
      // __LA_DECL int archive_read_has_encrypted_entries(struct archive *);
      entryIsEncrypted: this.cwrap('archive_entry_is_encrypted', 'number', ['number']),
      hasEncryptedEntries: this.cwrap('archive_read_has_encrypted_entries', 'number', ['number']),
      // __LA_DECL int archive_read_add_passphrase(struct archive *, const char *);
      addPassphrase: this.cwrap('archive_read_add_passphrase', 'number', ['number', 'string']),
      // Allocates a NUL-terminated copy of str on the WASM heap
      //this.stringToUTF(str), //
      string: (str) => this.allocate(this.intArrayFromString(str), 'i8', 0),
      malloc: this.cwrap('malloc', 'number', ['number']),
      free: this.cwrap('free', null, ['number']),
    }
  }

  // emscripten hook; intentionally a no-op (no dependency progress reporting)
  monitorRunDependencies() { }

  // emscripten hook: resolves the .wasm binary path relative to the client dist
  locateFile(path /* ,prefix */) {
    const wasmFilepath = Path.join(__dirname, `../../../client/dist/libarchive/wasm-gen/${path}`)
    return wasmFilepath
  }
}
/**
 * Initialize the libarchive WASM module and hand an ArchiveReader to the callback
 * @param {function(ArchiveReader)} cb - invoked once the module has loaded
 */
module.exports.getArchiveReader = (cb) => {
  const moduleConfig = new WasmModule()
  libarchive(moduleConfig).then((loadedModule) => {
    loadedModule.initFunctions()
    const reader = new ArchiveReader(loadedModule)
    cb(reader)
  })
}

View File

@ -681,7 +681,7 @@ class BookScanner {
const bookTitle = this.bookMetadata.title || this.libraryItemData.mediaMetadata.title
AudioFileScanner.setBookMetadataFromAudioMetaTags(bookTitle, this.audioFiles, this.bookMetadata, this.libraryScan)
} else if (this.ebookFileScanData) {
const ebookMetdataObject = this.ebookFileScanData.metadata
const ebookMetdataObject = this.ebookFileScanData.metadata || {}
for (const key in ebookMetdataObject) {
if (key === 'tags') {
if (ebookMetdataObject.tags.length) {

View File

@ -0,0 +1,35 @@
/**
 * Parse book metadata out of a ComicInfo.xml document converted to JSON.
 * TODO: Add more fields
 * @see https://anansi-project.github.io/docs/comicinfo/intro
 *
 * @param {Object} comicInfoJson
 * @returns {import('../../scanner/BookScanner').BookMetadataObject}
 */
module.exports.parse = (comicInfoJson) => {
  const comicInfo = comicInfoJson?.ComicInfo
  if (!comicInfo) return null

  // The XML-to-JSON conversion wraps element text in arrays, hence the [0]
  const seriesName = comicInfo.Series?.[0]?.trim() || null
  const seriesNumber = comicInfo.Number?.[0]?.trim() || null
  const summary = comicInfo.Summary?.[0]?.trim() || null

  let title = null
  const series = []
  if (seriesName) {
    series.push({
      name: seriesName,
      sequence: seriesNumber
    })
    // Title is "<Series> <Number>" when a number is present, else just the series
    title = seriesNumber ? `${seriesName} ${seriesNumber}` : seriesName
  }

  return {
    title,
    series,
    description: summary
  }
}

View File

@ -0,0 +1,109 @@
const Path = require('path')
const globals = require('../globals')
const fs = require('../../libs/fsExtra')
const Logger = require('../../Logger')
const Archive = require('../../libs/libarchive/archive')
const { xmlToJSON } = require('../index')
const parseComicInfoMetadata = require('./parseComicInfoMetadata')
/**
 * Read the entire comic file into memory
 *
 * @param {string} filepath
 * @returns {Promise<Buffer|null>} null if the file is missing or unreadable
 */
async function getComicFileBuffer(filepath) {
  if (!await fs.pathExists(filepath)) {
    Logger.error(`Comic path does not exist "${filepath}"`)
    return null
  }
  try {
    // await so a read failure rejects inside this try and is caught below,
    // instead of returning an unawaited promise that rejects at the caller
    return await fs.readFile(filepath)
  } catch (error) {
    Logger.error(`Failed to read comic at "${filepath}"`, error)
    return null
  }
}
/**
 * Extract cover image from comic return true if success
 *
 * @param {string} comicPath
 * @param {string} comicImageFilepath - entry path of the image inside the archive
 * @param {string} outputCoverPath
 * @returns {Promise<boolean>}
 */
async function extractCoverImage(comicPath, comicImageFilepath, outputCoverPath) {
  const comicFileBuffer = await getComicFileBuffer(comicPath)
  // Return false (not null) so the documented boolean contract holds
  if (!comicFileBuffer) return false

  const archive = await Archive.open(comicFileBuffer)
  try {
    const fileEntry = await archive.extractSingleFile(comicImageFilepath)
    if (!fileEntry?.fileData) {
      Logger.error(`[parseComicMetadata] Invalid file entry data for comicPath "${comicPath}"/${comicImageFilepath}`)
      return false
    }
    await fs.writeFile(outputCoverPath, fileEntry.fileData)
    return true
  } catch (error) {
    Logger.error(`[parseComicMetadata] Failed to extract image from comicPath "${comicPath}"`, error)
    return false
  } finally {
    // Terminate the archive's worker thread so it does not leak
    archive.close()
  }
}
module.exports.extractCoverImage = extractCoverImage
/**
 * Parse metadata from comic
 *
 * @param {import('../../models/Book').EBookFileObject} ebookFile
 * @returns {Promise<import('./parseEbookMetadata').EBookFileScanData>}
 */
async function parse(ebookFile) {
  const comicPath = ebookFile.metadata.path
  Logger.debug(`Parsing metadata from comic at "${comicPath}"`)

  const comicFileBuffer = await getComicFileBuffer(comicPath)
  if (!comicFileBuffer) return null

  const archive = await Archive.open(comicFileBuffer)
  try {
    const fileObjects = await archive.getFilesArray()
    // Natural sort so e.g. "page2" orders before "page10"
    fileObjects.sort((a, b) => {
      return a.file.name.localeCompare(b.file.name, undefined, {
        numeric: true,
        sensitivity: 'base'
      })
    })

    let metadata = null
    // NOTE(review): lookup is case-sensitive; confirm "comicinfo.xml" variants need not match
    const comicInfo = fileObjects.find(fo => fo.file.name === 'ComicInfo.xml')
    if (comicInfo) {
      const comicInfoEntry = await comicInfo.file.extract()
      if (comicInfoEntry?.fileData) {
        const comicInfoStr = new TextDecoder().decode(comicInfoEntry.fileData)
        const comicInfoJson = await xmlToJSON(comicInfoStr)
        if (comicInfoJson) {
          metadata = parseComicInfoMetadata.parse(comicInfoJson)
        }
      }
    }

    const payload = {
      path: comicPath,
      ebookFormat: ebookFile.ebookFormat,
      metadata
    }

    // First image (in natural sort order) is used as the cover
    const firstImage = fileObjects.find(fo => globals.SupportedImageTypes.includes(Path.extname(fo.file.name).toLowerCase().slice(1)))
    if (firstImage?.file?._path) {
      payload.ebookCoverPath = firstImage.file._path
    } else {
      Logger.warn(`Cover image not found in comic at "${comicPath}"`)
    }
    return payload
  } finally {
    // Terminate the archive's worker thread so it does not leak
    archive.close()
  }
}
module.exports.parse = parse

View File

@ -1,4 +1,5 @@
const parseEpubMetadata = require('./parseEpubMetadata')
const parseComicMetadata = require('./parseComicMetadata')
/**
* @typedef EBookFileScanData
@ -18,7 +19,9 @@ async function parse(ebookFile) {
if (!ebookFile) return null
if (ebookFile.ebookFormat === 'epub') {
return parseEpubMetadata.parse(ebookFile.metadata.path)
return parseEpubMetadata.parse(ebookFile)
} else if (['cbz', 'cbr'].includes(ebookFile.ebookFormat)) {
return parseComicMetadata.parse(ebookFile)
}
return null
}
@ -36,6 +39,8 @@ async function extractCoverImage(ebookFileScanData, outputCoverPath) {
if (ebookFileScanData.ebookFormat === 'epub') {
return parseEpubMetadata.extractCoverImage(ebookFileScanData.path, ebookFileScanData.ebookCoverPath, outputCoverPath)
} else if (['cbz', 'cbr'].includes(ebookFileScanData.ebookFormat)) {
return parseComicMetadata.extractCoverImage(ebookFileScanData.path, ebookFileScanData.ebookCoverPath, outputCoverPath)
}
return false
}

View File

@ -60,10 +60,11 @@ module.exports.extractCoverImage = extractCoverImage
/**
* Parse metadata from epub
*
* @param {string} epubPath
* @param {import('../../models/Book').EBookFileObject} ebookFile
* @returns {Promise<import('./parseEbookMetadata').EBookFileScanData>}
*/
async function parse(epubPath) {
async function parse(ebookFile) {
const epubPath = ebookFile.metadata.path
Logger.debug(`Parsing metadata from epub at "${epubPath}"`)
// Entrypoint of the epub that contains the filepath to the package document (opf file)
const containerJson = await extractXmlToJson(epubPath, 'META-INF/container.xml')