IndexError when using chatdocs add command for some documents #99

niclbw · 2024-01-03T01:08:54Z

I am having trouble adding some documents with the `chatdocs add` command and have not been able to find a solution. The full traceback is shown below (the command was run from the Anaconda Prompt):

┌─────────────────────────────── Traceback (most recent call last) ────────────────────────────────┐
│ D:\Users\INTERNET-USER\miniconda3\envs\chatdocs_env\lib\site-packages\chatdocs\main.py:40 in add │
│ │
│ 37 │ from .add import add │
│ 38 │ │
│ 39 │ config = get_config(config) │
│ > 40 │ add(config=config, source_directory=str(directory)) │
│ 41 │
│ 42 │
│ 43 @app.command() │
│ │
│ ┌───────────────────────────────────── locals ─────────────────────────────────────┐ │
│ │ add = <function add at 0x0000021F750468C0> │ │
│ │ config = { │ │
│ │ │ 'embeddings': {'model': 'hkunlp/instructor-large'}, │ │
│ │ │ 'llm': 'ctransformers', │ │
│ │ │ 'ctransformers': { │ │
│ │ │ │ 'model': 'TheBloke/Wizard-Vicuna-7B-Uncensored-GGML', │ │
│ │ │ │ 'model_file': 'Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_0.bin', │ │
│ │ │ │ 'model_type': 'llama', │ │
│ │ │ │ 'config': {'context_length': 1024, 'max_new_tokens': 256} │ │
│ │ │ }, │ │
│ │ │ 'huggingface': { │ │
│ │ │ │ 'model': 'TheBloke/Wizard-Vicuna-7B-Uncensored-HF', │ │
│ │ │ │ 'pipeline_kwargs': {'max_new_tokens': 256} │ │
│ │ │ }, │ │
│ │ │ 'gptq': { │ │
│ │ │ │ 'model': 'TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ', │ │
│ │ │ │ 'model_file': 'model.safetensors', │ │
│ │ │ │ 'pipeline_kwargs': {'max_new_tokens': 256} │ │
│ │ │ }, │ │
│ │ │ 'download': False, │ │
│ │ │ 'host': 'localhost', │ │
│ │ │ 'port': 5000, │ │
│ │ │ 'auth': False, │ │
│ │ │ 'chroma': { │ │
│ │ │ │ 'persist_directory': 'db', │ │
│ │ │ │ 'chroma_db_impl': 'duckdb+parquet', │ │
│ │ │ │ 'anonymized_telemetry': False │ │
│ │ │ }, │ │
│ │ │ ... +1 │ │
│ │ } │ │
│ │ directory = WindowsPath('testdocs') │ │
│ └──────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ D:\Users\INTERNET-USER\miniconda3\envs\chatdocs_env\lib\site-packages\chatdocs\add.py:160 in add │
│ │
│ 157 │ │ print("Creating new vectorstore") │
│ 158 │ │ texts = process_documents(source_directory) │
│ 159 │ │ print(f"Creating embeddings. May take a few minutes...") │
│ > 160 │ │ db = get_vectorstore_from_documents(config, texts) │
│ 161 │ db.persist() │
│ 162 │ db = None │
│ 163 │
│ │
│ ┌───────────────────────────────────────── locals ─────────────────────────────────────────┐ │
│ │ config = { │ │
│ │ │ 'embeddings': {'model': 'hkunlp/instructor-large'}, │ │
│ │ │ 'llm': 'ctransformers', │ │
│ │ │ 'ctransformers': { │ │
│ │ │ │ 'model': 'TheBloke/Wizard-Vicuna-7B-Uncensored-GGML', │ │
│ │ │ │ 'model_file': 'Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_0.bin', │ │
│ │ │ │ 'model_type': 'llama', │ │
│ │ │ │ 'config': {'context_length': 1024, 'max_new_tokens': 256} │ │
│ │ │ }, │ │
│ │ │ 'huggingface': { │ │
│ │ │ │ 'model': 'TheBloke/Wizard-Vicuna-7B-Uncensored-HF', │ │
│ │ │ │ 'pipeline_kwargs': {'max_new_tokens': 256} │ │
│ │ │ }, │ │
│ │ │ 'gptq': { │ │
│ │ │ │ 'model': 'TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ', │ │
│ │ │ │ 'model_file': 'model.safetensors', │ │
│ │ │ │ 'pipeline_kwargs': {'max_new_tokens': 256} │ │
│ │ │ }, │ │
│ │ │ 'download': False, │ │
│ │ │ 'host': 'localhost', │ │
│ │ │ 'port': 5000, │ │
│ │ │ 'auth': False, │ │
│ │ │ 'chroma': { │ │
│ │ │ │ 'persist_directory': 'db', │ │
│ │ │ │ 'chroma_db_impl': 'duckdb+parquet', │ │
│ │ │ │ 'anonymized_telemetry': False │ │
│ │ │ }, │ │
│ │ │ ... +1 │ │
│ │ } │ │
│ │ persist_directory = 'db' │ │
│ │ source_directory = 'testdocs' │ │
│ │ texts = [] │ │
│ └──────────────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ D:\Users\INTERNET-USER\miniconda3\envs\chatdocs_env\lib\site-packages\chatdocs\vectorstores.py:2 │
│ 7 in get_vectorstore_from_documents │
│ │
│ 24 ) -> VectorStore: │
│ 25 │ embeddings = get_embeddings(config) │
│ 26 │ config = config["chroma"] │
│ > 27 │ return Chroma.from_documents( │
│ 28 │ │ documents, │
│ 29 │ │ embeddings, │
│ 30 │ │ persist_directory=config["persist_directory"], │
│ │
│ ┌─────────────────────────────────────────── locals ───────────────────────────────────────────┐ │
│ │ config = { │ │
│ │ │ 'persist_directory': 'db', │ │
│ │ │ 'chroma_db_impl': 'duckdb+parquet', │ │
│ │ │ 'anonymized_telemetry': False │ │
│ │ } │ │
│ │ documents = [] │ │
│ │ embeddings = HuggingFaceInstructEmbeddings( │ │
│ │ │ client=INSTRUCTOR( │ │
│ │ (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with │ │
│ │ Transformer model: T5EncoderModel │ │
│ │ (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': │ │
│ │ False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, │ │
│ │ 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': │ │
│ │ False, 'pooling_mode_lasttoken': False}) │ │
│ │ (2): Dense({'in_features': 1024, 'out_features': 768, 'bias': False, │ │
│ │ 'activation_function': 'torch.nn.modules.linear.Identity'}) │ │
│ │ (3): Normalize() │ │
│ │ ), │ │
│ │ │ model_name='hkunlp/instructor-large', │ │
│ │ │ cache_folder=None, │ │
│ │ │ model_kwargs={}, │ │
│ │ │ encode_kwargs={}, │ │
│ │ │ embed_instruction='Represent the document for retrieval: ', │ │
│ │ │ query_instruction='Represent the question for retrieving supporting │ │
│ │ documents: ' │ │
│ │ ) │ │
│ └──────────────────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ D:\Users\INTERNET-USER\miniconda3\envs\chatdocs_env\lib\site-packages\langchain_community\vector │
│ stores\chroma.py:771 in from_documents │
│ │
│ 768 │ │ """ │
│ 769 │ │ texts = [doc.page_content for doc in documents] │
│ 770 │ │ metadatas = [doc.metadata for doc in documents] │
│ > 771 │ │ return cls.from_texts( │
│ 772 │ │ │ texts=texts, │
│ 773 │ │ │ embedding=embedding, │
│ 774 │ │ │ metadatas=metadatas, │
│ │
│ ┌─────────────────────────────────────────── locals ───────────────────────────────────────────┐ │
│ │ client = None │ │
│ │ client_settings = Settings( │ │
│ │ │ environment='', │ │
│ │ │ chroma_db_impl='duckdb+parquet', │ │
│ │ │ chroma_api_impl='chromadb.api.local.LocalAPI', │ │
│ │ │ chroma_telemetry_impl='chromadb.telemetry.posthog.Posthog', │ │
│ │ │ chroma_sysdb_impl='chromadb.db.impl.sqlite.SqliteDB', │ │
│ │ │ chroma_producer_impl='chromadb.db.impl.sqlite.SqliteDB', │ │
│ │ │ chroma_consumer_impl='chromadb.db.impl.sqlite.SqliteDB', │ │
│ │ │ │ │
│ │ chroma_segment_manager_impl='chromadb.segment.impl.manager.local.Loca… │ │
│ │ │ clickhouse_host=None, │ │
│ │ │ clickhouse_port=None, │ │
│ │ │ tenant_id='default', │ │
│ │ │ topic_namespace='default', │ │
│ │ │ persist_directory='db', │ │
│ │ │ chroma_server_host=None, │ │
│ │ │ chroma_server_http_port=None, │ │
│ │ │ chroma_server_ssl_enabled=False, │ │
│ │ │ chroma_server_grpc_port=None, │ │
│ │ │ chroma_server_cors_allow_origins=[], │ │
│ │ │ anonymized_telemetry=False, │ │
│ │ │ allow_reset=False, │ │
│ │ │ sqlite_database=':memory:', │ │
│ │ │ migrations='apply' │ │
│ │ ) │ │
│ │ cls = <class 'langchain_community.vectorstores.chroma.Chroma'> │ │
│ │ collection_metadata = None │ │
│ │ collection_name = 'langchain' │ │
│ │ documents = [] │ │
│ │ embedding = HuggingFaceInstructEmbeddings( │ │
│ │ │ client=INSTRUCTOR( │ │
│ │ (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) │ │
│ │ with Transformer model: T5EncoderModel │ │
│ │ (1): Pooling({'word_embedding_dimension': 768, │ │
│ │ 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, │ │
│ │ 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': │ │
│ │ False, 'pooling_mode_weightedmean_tokens': False, │ │
│ │ 'pooling_mode_lasttoken': False}) │ │
│ │ (2): Dense({'in_features': 1024, 'out_features': 768, 'bias': False, │ │
│ │ 'activation_function': 'torch.nn.modules.linear.Identity'}) │ │
│ │ (3): Normalize() │ │
│ │ ), │ │
│ │ │ model_name='hkunlp/instructor-large', │ │
│ │ │ cache_folder=None, │ │
│ │ │ model_kwargs={}, │ │
│ │ │ encode_kwargs={}, │ │
│ │ │ embed_instruction='Represent the document for retrieval: ', │ │
│ │ │ query_instruction='Represent the question for retrieving │ │
│ │ supporting documents: ' │ │
│ │ ) │ │
│ │ ids = None │ │
│ │ kwargs = {} │ │
│ │ metadatas = [] │ │
│ │ persist_directory = 'db' │ │
│ │ texts = [] │ │
│ └──────────────────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ D:\Users\INTERNET-USER\miniconda3\envs\chatdocs_env\lib\site-packages\langchain_community\vector │
│ stores\chroma.py:735 in from_texts │
│ │
│ 732 │ │ │ │ │ ids=batch[0], │
│ 733 │ │ │ │ ) │
│ 734 │ │ else: │
│ > 735 │ │ │ chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids) │
│ 736 │ │ return chroma_collection │
│ 737 │ │
│ 738 │ @classmethod │
│ │
│ ┌─────────────────────────────────────────── locals ───────────────────────────────────────────┐ │
│ │ chroma_collection = <langchain_community.vectorstores.chroma.Chroma object at │ │
│ │ 0x0000021F7507FBB0> │ │
│ │ client = None │ │
│ │ client_settings = Settings( │ │
│ │ │ environment='', │ │
│ │ │ chroma_db_impl='duckdb+parquet', │ │
│ │ │ chroma_api_impl='chromadb.api.local.LocalAPI', │ │
│ │ │ chroma_telemetry_impl='chromadb.telemetry.posthog.Posthog', │ │
│ │ │ chroma_sysdb_impl='chromadb.db.impl.sqlite.SqliteDB', │ │
│ │ │ chroma_producer_impl='chromadb.db.impl.sqlite.SqliteDB', │ │
│ │ │ chroma_consumer_impl='chromadb.db.impl.sqlite.SqliteDB', │ │
│ │ │ │ │
│ │ chroma_segment_manager_impl='chromadb.segment.impl.manager.local.Loca… │ │
│ │ │ clickhouse_host=None, │ │
│ │ │ clickhouse_port=None, │ │
│ │ │ tenant_id='default', │ │
│ │ │ topic_namespace='default', │ │
│ │ │ persist_directory='db', │ │
│ │ │ chroma_server_host=None, │ │
│ │ │ chroma_server_http_port=None, │ │
│ │ │ chroma_server_ssl_enabled=False, │ │
│ │ │ chroma_server_grpc_port=None, │ │
│ │ │ chroma_server_cors_allow_origins=[], │ │
│ │ │ anonymized_telemetry=False, │ │
│ │ │ allow_reset=False, │ │
│ │ │ sqlite_database=':memory:', │ │
│ │ │ migrations='apply' │ │
│ │ ) │ │
│ │ cls = <class 'langchain_community.vectorstores.chroma.Chroma'> │ │
│ │ collection_metadata = None │ │
│ │ collection_name = 'langchain' │ │
│ │ embedding = HuggingFaceInstructEmbeddings( │ │
│ │ │ client=INSTRUCTOR( │ │
│ │ (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) │ │
│ │ with Transformer model: T5EncoderModel │ │
│ │ (1): Pooling({'word_embedding_dimension': 768, │ │
│ │ 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, │ │
│ │ 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': │ │
│ │ False, 'pooling_mode_weightedmean_tokens': False, │ │
│ │ 'pooling_mode_lasttoken': False}) │ │
│ │ (2): Dense({'in_features': 1024, 'out_features': 768, 'bias': False, │ │
│ │ 'activation_function': 'torch.nn.modules.linear.Identity'}) │ │
│ │ (3): Normalize() │ │
│ │ ), │ │
│ │ │ model_name='hkunlp/instructor-large', │ │
│ │ │ cache_folder=None, │ │
│ │ │ model_kwargs={}, │ │
│ │ │ encode_kwargs={}, │ │
│ │ │ embed_instruction='Represent the document for retrieval: ', │ │
│ │ │ query_instruction='Represent the question for retrieving │ │
│ │ supporting documents: ' │ │
│ │ ) │ │
│ │ ids = [] │ │
│ │ kwargs = {} │ │
│ │ metadatas = [] │ │
│ │ persist_directory = 'db' │ │
│ │ texts = [] │ │
│ └──────────────────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ D:\Users\INTERNET-USER\miniconda3\envs\chatdocs_env\lib\site-packages\langchain_community\vector │
│ stores\chroma.py:275 in add_texts │
│ │
│ 272 │ │ embeddings = None │
│ 273 │ │ texts = list(texts) │
│ 274 │ │ if self._embedding_function is not None: │
│ > 275 │ │ │ embeddings = self._embedding_function.embed_documents(texts) │
│ 276 │ │ if metadatas: │
│ 277 │ │ │ # fill metadatas with empty dicts if somebody │
│ 278 │ │ │ # did not specify metadata for all texts │
│ │
│ ┌────────────────────────────────────────── locals ──────────────────────────────────────────┐ │
│ │ embeddings = None │ │
│ │ ids = [] │ │
│ │ kwargs = {} │ │
│ │ metadatas = [] │ │
│ │ self = <langchain_community.vectorstores.chroma.Chroma object at 0x0000021F7507FBB0> │ │
│ │ texts = [] │ │
│ └────────────────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ D:\Users\INTERNET-USER\miniconda3\envs\chatdocs_env\lib\site-packages\langchain_community\embedd │
│ ings\huggingface.py:170 in embed_documents │
│ │
│ 167 │ │ │ List of embeddings, one for each text. │
│ 168 │ │ """ │
│ 169 │ │ instruction_pairs = [[self.embed_instruction, text] for text in texts] │
│ > 170 │ │ embeddings = self.client.encode(instruction_pairs, **self.encode_kwargs) │
│ 171 │ │ return embeddings.tolist() │
│ 172 │ │
│ 173 │ def embed_query(self, text: str) -> List[float]: │
│ │
│ ┌─────────────────────────────────────────── locals ───────────────────────────────────────────┐ │
│ │ instruction_pairs = [] │ │
│ │ self = HuggingFaceInstructEmbeddings( │ │
│ │ │ client=INSTRUCTOR( │ │
│ │ (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with │ │
│ │ Transformer model: T5EncoderModel │ │
│ │ (1): Pooling({'word_embedding_dimension': 768, │ │
│ │ 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, │ │
│ │ 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': │ │
│ │ False, 'pooling_mode_weightedmean_tokens': False, │ │
│ │ 'pooling_mode_lasttoken': False}) │ │
│ │ (2): Dense({'in_features': 1024, 'out_features': 768, 'bias': False, │ │
│ │ 'activation_function': 'torch.nn.modules.linear.Identity'}) │ │
│ │ (3): Normalize() │ │
│ │ ), │ │
│ │ │ model_name='hkunlp/instructor-large', │ │
│ │ │ cache_folder=None, │ │
│ │ │ model_kwargs={}, │ │
│ │ │ encode_kwargs={}, │ │
│ │ │ embed_instruction='Represent the document for retrieval: ', │ │
│ │ │ query_instruction='Represent the question for retrieving supporting │ │
│ │ documents: ' │ │
│ │ ) │ │
│ │ texts = [] │ │
│ └──────────────────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ D:\Users\INTERNET-USER\AppData\Roaming\Python\Python310\site-packages\InstructorEmbedding\instru │
│ ctor.py:527 in encode │
│ │
│ 524 │ │ │
│ 525 │ │ all_embeddings = [] │
│ 526 │ │ #try: │
│ > 527 │ │ if isinstance(sentences[0],list): │
│ 528 │ │ │ │ lengths = [] │
│ 529 │ │ │ │ for sen in sentences: │
│ 530 │ │ │ │ │ lengths.append(-self._text_length(sen[1])) │
│ │
│ ┌─────────────────────────────────────────── locals ───────────────────────────────────────────┐ │
│ │ all_embeddings = [] │ │
│ │ batch_size = 32 │ │
│ │ convert_to_numpy = True │ │
│ │ convert_to_tensor = False │ │
│ │ device = device(type='cpu') │ │
│ │ input_was_string = False │ │
│ │ normalize_embeddings = False │ │
│ │ output_value = 'sentence_embedding' │ │
│ │ self = INSTRUCTOR( │ │
│ │ (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) │ │
│ │ with Transformer model: T5EncoderModel │ │
│ │ (1): Pooling({'word_embedding_dimension': 768, │ │
│ │ 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, │ │
│ │ 'pooling_mode_max_tokens': False, │ │
│ │ 'pooling_mode_mean_sqrt_len_tokens': False, │ │
│ │ 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': │ │
│ │ False}) │ │
│ │ (2): Dense({'in_features': 1024, 'out_features': 768, 'bias': │ │
│ │ False, 'activation_function': 'torch.nn.modules.linear.Identity'}) │ │
│ │ (3): Normalize() │ │
│ │ ) │ │
│ │ sentences = [] │ │
│ │ show_progress_bar = False │ │
│ └──────────────────────────────────────────────────────────────────────────────────────────────┘ │
└──────────────────────────────────────────────────────────────────────────────────────────────────┘
IndexError: list index out of range

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

IndexError when using chatdocs add command for some documents #99

IndexError when using chatdocs add command for some documents #99

niclbw commented Jan 3, 2024

IndexError when using chatdocs add command for some documents #99

IndexError when using chatdocs add command for some documents #99

Comments

niclbw commented Jan 3, 2024