from docarray import DocumentArray, Document
from jina import Executor

docs = DocumentArray(
    [
        Document(text="It was a dark and stormy night."),
        Document(text="What do a raven and a writing desk have in common?"),
        Document(text="Turn to p.13 to read about J.R.R. Tolkien pinging google.com in 3.4 seconds"),
    ]
)

# Pull the Sentencizer Executor from Jina Hub and run it on the Documents directly
exec = Executor.from_hub("jinahub://Sentencizer")
exec.segment(docs, parameters={})

for doc in docs:
    for chunk in doc.chunks:
        print(chunk.text)
    print("---")
For comparison, we can run the same Documents through the SpacySentencizer Executor:

from docarray import DocumentArray, Document
from jina import Executor

docs = DocumentArray(
    [
        Document(text="It was a dark and stormy night."),
        Document(text="What do a raven and a writing desk have in common?"),
        Document(text="Turn to p.13 to read about J.R.R. Tolkien pinging google.com in 3.4 seconds"),
    ]
)

exec = Executor.from_hub("jinahub://SpacySentencizer")
exec.segment(docs, parameters={})

for doc in docs:
    for chunk in doc.chunks:
        print(chunk.text)
    print("---")
The results now look like the figure below:
Next, we add the Executor to a Flow:
from docarray import DocumentArray
from jina import Flow

from executors import ChunkSentencizer  # the custom Executor defined earlier in this tutorial

docs = DocumentArray.from_files("data/*.pdf", recursive=True)

flow = (
    Flow()
    .add(uses="jinahub+sandbox://PDFSegmenter", install_requirements=True, name="segmenter")
    .add(uses=ChunkSentencizer, name="chunk_sentencizer")
)

with flow:
    indexed_docs = flow.index(docs)
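To see how the segmenter and sentencizer nest their output, it helps to peek at the chunk tree of one of the returned Documents. A minimal sketch, assuming the Flow above has run and that docarray's summary() helper is available:

# Inspect the nested chunk structure of the first indexed Document:
# level-1 chunks come from PDFSegmenter, level-2 chunks are the sentences
# produced by ChunkSentencizer. summary() prints the whole Document tree.
indexed_docs[0].summary()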
3.2 Normalizing the images
from jina import Executor, requests
import numpy as np

class ImageNormalizer(Executor):
    @requests(on="/index")
    def normalize_chunks(self, docs, **kwargs):
        for doc in docs:
            # doc.chunks[...] traverses all nested chunks, whatever their depth
            for chunk in doc.chunks[...]:
                if chunk.blob:
                    chunk.convert_blob_to_image_tensor()

                if hasattr(chunk, "tensor"):
                    if chunk.tensor is not None:
                        # Keep a data URI of the raw image in the tags before
                        # resizing and normalizing the tensor
                        chunk.convert_image_tensor_to_uri()
                        chunk.tags["image_datauri"] = chunk.uri
                        chunk.tensor = chunk.tensor.astype(np.uint8)
                        chunk.set_image_tensor_shape((64, 64))
                        chunk.set_image_tensor_normalization()
In the second branch (when the chunk carries a tensor), we store the data URI of the not-yet-normalized image in the metadata (the tags field), so that it can be retrieved later and the image displayed on the frontend.
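As a rough illustration of how that metadata gets used later, the data URI can be read straight back out of the tags. The helper below is hypothetical and only shows the access pattern:

from docarray import Document

def collect_image_datauris(doc: Document):
    # Gather the data URIs that ImageNormalizer stashed in chunk.tags,
    # e.g. to render the original images in a search frontend
    return [
        chunk.tags["image_datauri"]
        for chunk in doc.chunks[...]
        if "image_datauri" in chunk.tags
    ]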
To keep the text chunks and image chunks from interfering with each other:
from docarray import DocumentArray
from jina import Flow

from executors import ChunkSentencizer, ImageNormalizer  # the custom Executors defined above

docs = DocumentArray.from_files("data/*.pdf", recursive=True)

flow = (
    Flow()
    .add(uses="jinahub+sandbox://PDFSegmenter", install_requirements=True, name="segmenter")
    .add(uses=ChunkSentencizer, name="chunk_sentencizer")
    .add(uses=ImageNormalizer, name="image_normalizer")
)

with flow:
    indexed_docs = flow.index(docs)
With the steps above we have:
* Taken in a brand-new PDF
* Split the PDF into text and image chunks
* Further split the text chunks into sentence chunks
* Normalized the images
The result looks like the figure below:
A new Executor, ChunkMerger, brings the text chunks and image chunks onto the same level:
from jina import Executor, requests
import numpy as np

class ImageNormalizer(Executor):
    @requests(on="/index")
    def normalize_chunks(self, docs, **kwargs):
        ...

class ChunkMerger(Executor):
    @requests(on="/index")
    def merge_chunks(self, docs, **kwargs):
        for doc in docs:  # level 0 document
            for chunk in doc.chunks:
                if doc.text:
                    docs.pop(chunk.id)
            # Flatten all nested chunks so sentences and images sit on one level
            doc.chunks = doc.chunks[...]
It runs once sentencizing is done, so we slot it into the Flow right after the chunk_sentencizer:
from docarray import DocumentArray
from executors import ChunkSentencizer, ChunkMerger, ImageNormalizer
from jina import Flow

docs = DocumentArray.from_files("data/*.pdf", recursive=True)

flow = (
    Flow()
    .add(uses="jinahub+sandbox://PDFSegmenter", install_requirements=True, name="segmenter")
    .add(uses=ChunkSentencizer, name="chunk_sentencizer")
    .add(uses=ChunkMerger, name="chunk_merger")
    .add(uses=ImageNormalizer, name="image_normalizer")
)

with flow:
    indexed_docs = flow.index(docs)
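A quick sanity check can confirm that each top-level Document now carries a single flat layer of chunks, with sentences and images side by side. This is only a sketch, assuming the Flow above returned indexed_docs; the "kind" label is just an illustration based on the tag set by ImageNormalizer:

# After ChunkMerger, every Document's chunks should be one flat level deep
for doc in indexed_docs:
    print(doc.uri, "->", len(doc.chunks), "chunks")
    for chunk in doc.chunks:
        kind = "image" if "image_datauri" in chunk.tags else "sentence"
        print("  ", kind, ":", chunk.text[:40] if chunk.text else "<image tensor>")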
That wraps up Part 1 of this series. In Part 2 we will add an encoder to the Flow and use CLIP to encode both text and images into vectors, making semantic search straightforward.