⌈iOS⌋对 WKWebView 截图并生成视频的一种姿势

或许你有过这样的使用体验：当你看完某 App 的年度报告之后，会有一个交互按钮支持你生成视频并保存到相册。当你点击并经过一定的等待时间之后，你会发现相册里多了一段内容为自动翻页的年度报告视频。

谓之何解？照 WKWebView 且使动之为之一途。

WKWebView 截图

毫无疑问，经历过 WKWebView 截长图需求的开发者都会觉得这是个不简单的话题。好消息是我们这里只探讨对当前显示区域的截图，借助其特有的相关 API，很容易办到：

// iOS 11+
let config = WKSnapshotConfiguration
// 设置截图的最终宽度，如果显示区域与之有出入，将会等比例缩放。
config.snapshotWidth = width
webView.takeSnapshot(with: config) { image, _ in }

现在，我们不妨将时序和截图逻辑封装起来，向使用方不断输出帧序列，并告知当前的帧索引：

class FrameSampler: NSObject {
    let url: URL
    /// 容器大小
    let size: CGSize
    let timeInterval: DispatchTimeInterval
    /// 主线程回调
    let completion: Completion
    init(
        url: URL,
        size: CGSize,
        timeInterval: DispatchTimeInterval,
        completion: @escaping Completion
    ) {
        self.url = url
        self.size = size
        self.timeInterval = timeInterval
        self.completion = completion
        super.init()
    }

    func sample(ticker: @escaping (UIImage, Int) -> Void) {
        _ticker = ticker
        _index = 0

        // 加载 WKWebView
    }

    private var _ticker: ((UIImage, Int) -> Void)?
    private var _index = 0
    private lazy var _timer: DispatchSourceTimer = {
        let width = size.width
        let result = DispatchSource.makeTimerSource(queue: .main)
        result.schedule(deadline: .now(), repeating: timeInterval)
        result.setEventHandler { [weak self] in
            let config = WKSnapshotConfiguration()
            config.snapshotWidth = width
            self?._webView.takeSnapshot(with: config) { image, _ in
                guard let self else { return }
                self._ticker?(image, self._index)
                self._index += 1
            }
        }
        return result
    }
}

extension FrameSampler: WKNavigationDelegate {
    func webView(
        _ webView: WKWebView,
        decidePolicyFor navigationAction: WKNavigationAction,
        decisionHandler: @escaping (WKNavigationActionPolicy) -> Void
    ) {
        let url = navigationAction.request.url?.absoluteString ?? ""

        // 根据 url 决定什么时候开始，什么时候结束，什么时候取消
        guard let action = _Action(rawValue: url) else {
            decisionHandler(.allow); return
        }

        switch action {
        case .start:
            _timer.activate()
        case .stop:
            _reset()
            completion(.success(()))
        case .cancel:
            _reset()
            completion(.failure(.frontendCancelled))
        }

        decisionHandler(.cancel)
    }

    private func _reset() {
        _timer.cancel()
        _ticker = nil
        _webView.removeFromSuperview()
    }

    private enum _Action: String {
        case start = "YourStartURL"
        case stop = "YourStopURL"
        case cancel = "YourCancelURL"
    }
}

视频合成

在上一步中，我们得到了想要帧序列，现在只需要将其按时间顺序导出为动画并合成音频即可大功告成。对于音频需要注意的是，其长度和视频本身不一定对等。故在例程中当音频长度小于视频时，就循环插入音轨。

这里，我们先拆分整个视频合成流程中涉及到的角色：

CIContext：它将通过 Metal 将 UIImage 转换为 CVPixelBuffer
AVAssetWriterInput、AVAssetWriterInputPixelBufferAdaptor 和 AVAssetWriter：按照时序将帧序列合成动画
AVMutableComposition：将动画和音频合成为视频
AVAssetExportSession：当视频文件导出到指定路径

UIImage to CVPixelBuffer

class VideoComposer {
    // 如果为 nil，可让 VideoComposer 初始化失败
    private var context = MTLCreateSystemDefaultDevice().map { CIContext(mtlDevice: $0) }

    private func _compose(image: UIImage, index: Int64) {
        guard
            videoInput.isReadyForMoreMediaData,
            let pool = videoAdaptor.pixelBufferPool,
            let cgImage = image.cgImage
        else { return }
        let bitmap = CIImage(cgImage: cgImage)
        let present = CMTimeMake(value: index, timescale: frameRate)
        guard let buffer = bitmap.toPixelBuffer(context: context, pool: pool) else { return }
        videoAdaptor.append(buffer, withPresentationTime: present)
    }
}

extension CIImage {
    func toPixelBuffer(
        context: CIContext,
        pool: CVPixelBufferPool
    ) -> CVPixelBuffer? {
        var buffer: CVPixelBuffer?
        CVPixelBufferPoolCreatePixelBuffer(kCFAllocatorDefault, pool, &buffer)
        guard let buffer = buffer else { return nil }
        context.render(self, to: buffer)
        return buffer
    }
}

Init AVAssetWriterInput、AVAssetWriterInputPixelBufferAdaptor、AVAssetWriter


init?(many parameters) {
    guard
        let preset = AVOutputSettingsAssistant.availableOutputSettingsPresets().first,
        var videoSettings = AVOutputSettingsAssistant(preset: preset)?.videoSettings
    else { return nil }
    videoSettings[AVVideoWidthKey] = width as NSNumber
    videoSettings[AVVideoHeightKey] = height as NSNumber

    videoInput = AVAssetWriterInput(
        mediaType: .video,
        outputSettings: videoSettings
    )
    // ⚠️ Important
    // To ensure optimal behavior,
    // don’t set the value of this property and performsMultiPassEncodingIfSupported to true at the same time.
    // videoInput.performsMultiPassEncodingIfSupported = true
    videoInput.expectsMediaDataInRealTime = true
    videoAdaptor = AVAssetWriterInputPixelBufferAdaptor(
        assetWriterInput: videoInput,
        sourcePixelBufferAttributes: [
            kCVPixelBufferWidthKey as String: width as NSNumber,
            kCVPixelBufferHeightKey as String: height as NSNumber,
            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32ARGB as NSNumber,
        ]
    )

    let writer = try? AVAssetWriter(outputURL: tmpVideoURL, fileType: .mp4)
    guard let writer = writer, writer.canAdd(videoInput) else { return nil }
    writer.add(videoInput)

    guard writer.startWriting() else { return nil }
    writer.startSession(atSourceTime: .zero)
}

Merge audio track and export

在帧序列处理完成之后，进行音轨的合成：

private func _finish(index: Int64, completion: @escaping Completion) {
    videoInput.markAsFinished()
    writer.endSession(atSourceTime: CMTimeMake(value: index, timescale: frameRate))
    writer.finishWriting { [weak self] in
        self?._merge(completion: completion)
    }
}

private func _merge(completion: @escaping Completion) {
    guard let audioURL = audioURL else {
        try? FileManager.default.copyItem(at: tmpVideoURL, to: outputURL)
        completion(.success(()))
        return
    }

    let videoAsset = AVAsset(url: tmpVideoURL)
    let audioAsset = AVAsset(url: audioURL)
    let composition = AVMutableComposition()

    guard
        let cVideoTrack = composition.addMutableTrack(
            withMediaType: .video,
            preferredTrackID: kCMPersistentTrackID_Invalid
        ),
        let cAudioTrack = composition.addMutableTrack(
            withMediaType: .audio,
            preferredTrackID: kCMPersistentTrackID_Invalid
        ),
        let aVideoTrack = videoAsset.tracks(withMediaType: .video).first,
        let aAudioTrack = audioAsset.tracks(withMediaType: .audio).first
    else {
        completion(.failure(.exportFailed("音视频通道读取失败")))
        return
    }

    cVideoTrack.preferredTransform = videoAsset.preferredTransform

    let videoDuration = videoAsset.duration
    let audioDuration = audioAsset.duration

    do {
        try cVideoTrack.insertTimeRange(
            CMTimeRange(start: .zero, duration: videoDuration),
            of: aVideoTrack,
            at: .zero
        )
    } catch {
        completion(.failure(.exportFailed("视频通道合成失败：\(error.localizedDescription)")))
        return
    }

    let audioTimeScale = aAudioTrack.naturalTimeScale
    var timestamps: [(CMTimeRange, CMTime)] = []
    // 如果视频比音频长，则循环插入音轨。
    if videoDuration.seconds > audioDuration.seconds {
        let cycleCount = videoDuration.seconds / audioDuration.seconds
        let cycleCountInt = Int(cycleCount)
        for index in 0..<cycleCountInt {
            let startTime = CMTime(
                seconds: audioDuration.seconds * Double(index),
                preferredTimescale: audioTimeScale
            )
            timestamps.append((CMTimeRange(start: .zero, duration: audioDuration), startTime))
        }

        let reseted = (cycleCount - Double(cycleCountInt)) * audioDuration.seconds
        let timeRange = CMTimeRange(
            start: .zero,
            duration: CMTime(seconds: reseted, preferredTimescale: audioTimeScale)
        )
        let startTime = CMTime(
            seconds: audioDuration.seconds * Double(cycleCountInt),
            preferredTimescale: audioTimeScale
        )
        timestamps.append((timeRange, startTime))
    } else {
        timestamps.append((CMTimeRange(start: .zero, duration: videoDuration), .zero))
    }

    for timestamp in timestamps {
        do {
            try cAudioTrack.insertTimeRange(timestamp.0, of: aAudioTrack, at: timestamp.1)
        } catch {
            completion(.failure(.exportFailed("音频通道合成失败：\(error.localizedDescription)")))
            return
        }
    }

    guard
        let exportSession = AVAssetExportSession(asset: composition, presetName: AVAssetExportPresetHighestQuality)
    else {
        completion(.failure(.exportFailed("视频导出会话创建失败")))
        return
    }
    exportSession.outputFileType = outputType
    exportSession.outputURL = outputURL
    exportSession.shouldOptimizeForNetworkUse = true
    exportSession.exportAsynchronously { [weak exportSession] in
        guard exportSession?.status == .completed else {
            completion(.failure(.exportFailed(exportSession?.error?.localizedDescription ?? "未知错误")))
            return
        }
        completion(.success(()))
    }
}

局限性

由于帧序列的处理是耗时操作，所以需要放在后台线程中进行。但是帧序列会不断的产生，一定存在消耗不及时的情况。为了避免这种情况，需要在异步串行队列中调用 _compose 和 _finish：

private lazy var operationQueue: OperationQueue = {
    let result = OperationQueue()
    // ⚠️ 需要指定 qos，避免在主线程中执行，导致和主线程 WKWebView 死锁。
    result.qualityOfService = .userInitiated
    // ⚠️ 不能开启并发，AVAssetWriterInput 和 AVAssetWriterInputPixelBufferAdaptor 非线程安全。
    result.maxConcurrentOperationCount = 1
    return result
}()

func compose(image: UIImage, index: Int64) {
    operationQueue.addOperation { [weak self] in
        self?._compose(image: image, index: index)
    }
}

/// 主线程回调
func finish(index: Int64, completion: @escaping Completion) {
    operationQueue.addOperation { [weak self] in
        let completion = { result in DispatchQueue.main.async { completion(result) } }
        self?._finish(index: index, completion: completion)
    }
}

如果帧率和分辨率要求过高，当前设备吞吐速度无法满足，就会不断有任务被 pending，最终导致 App 因 Out of Memory 被系统终止运行。

总结

相比于需要对视频标记进行加工的第三方框架方案，对网页进行截图并合成音轨生成视频无疑是更为简单和通用的方案。不过在你接受它之前，需要先考虑它的“创作”时间和在不同时代设备上的兼容问题。如果你要求过高，它随时可能撂挑子～