Chat on Tabular Data
TableGPT Agent excels at analyzing and processing tabular data. To perform data analysis, you first need to let the agent "see" the dataset. This is done through a dedicated "file-reading" workflow. In short, you begin by "uploading" the dataset and letting the agent read it. Once the data has been read, you can ask the agent questions about it.
To learn more about the file-reading workflow, see File Reading.
For data analysis tasks, we introduce two important parameters when creating the agent: `checkpointer` and `session_id`.
- The `checkpointer` should be an instance of `langgraph.checkpoint.base.BaseCheckpointSaver`, which acts as a versioned "memory" for the agent. (See langgraph's persistence concept for more details.)
- The `session_id` is a unique identifier for the current session. It ties the agent's execution to a specific kernel, ensuring that the agent's results are retained across multiple invocations.
In [1]:
Copied!
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import MemorySaver
from pybox import AsyncLocalPyBoxManager
from tablegpt import DEFAULT_TABLEGPT_IPYKERNEL_PROFILE_DIR
from tablegpt.agent import create_tablegpt_graph
# Chat model client; replace the placeholder URL with your own vLLM endpoint.
llm = ChatOpenAI(
    openai_api_base="YOUR_VLLM_URL",
    openai_api_key="whatever",
    model_name="TableGPT2-7B",
)

# Kernel manager configured with TableGPT's default IPython kernel profile directory.
pybox_manager = AsyncLocalPyBoxManager(
    profile_dir=DEFAULT_TABLEGPT_IPYKERNEL_PROFILE_DIR,
)

# Checkpointer that serves as the agent's versioned "memory" between invocations.
checkpointer = MemorySaver()

agent = create_tablegpt_graph(
    llm=llm,
    pybox_manager=pybox_manager,
    checkpointer=checkpointer,
    session_id="some-session-id",  # This is required when using file-reading
)
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import MemorySaver
from pybox import AsyncLocalPyBoxManager
from tablegpt import DEFAULT_TABLEGPT_IPYKERNEL_PROFILE_DIR
from tablegpt.agent import create_tablegpt_graph
# Chat model client; replace the placeholder URL with your own vLLM endpoint.
llm = ChatOpenAI(
    openai_api_base="YOUR_VLLM_URL",
    openai_api_key="whatever",
    model_name="TableGPT2-7B",
)

# Kernel manager configured with TableGPT's default IPython kernel profile directory.
pybox_manager = AsyncLocalPyBoxManager(
    profile_dir=DEFAULT_TABLEGPT_IPYKERNEL_PROFILE_DIR,
)

# Checkpointer that serves as the agent's versioned "memory" between invocations.
checkpointer = MemorySaver()

agent = create_tablegpt_graph(
    llm=llm,
    pybox_manager=pybox_manager,
    checkpointer=checkpointer,
    session_id="some-session-id",  # This is required when using file-reading
)
Add the file for processing in the additional_kwargs of HumanMessage. Here's an example using the Titanic dataset.
In [2]:
Copied!
from typing import TypedDict
from langchain_core.messages import HumanMessage
class Attachment(TypedDict):
    """A file reference attached to a message for the agent to read.

    Instances are placed in the ``attachments`` list inside the
    ``additional_kwargs`` of a ``HumanMessage``; that list should contain at
    least one such dictionary.
    """

    # Name of the file to read; it must be accessible from the IPython kernel.
    filename: str
# An empty-content message whose only payload is the attached file reference.
attachment_msg = HumanMessage(
    content="",
    # Please make sure your IPython kernel can access your filename.
    additional_kwargs={
        "attachments": [
            Attachment(filename="titanic.csv"),
        ],
    },
)
from typing import TypedDict
from langchain_core.messages import HumanMessage
class Attachment(TypedDict):
    """A file reference attached to a message for the agent to read.

    Instances are placed in the ``attachments`` list inside the
    ``additional_kwargs`` of a ``HumanMessage``; that list should contain at
    least one such dictionary.
    """

    # Name of the file to read; it must be accessible from the IPython kernel.
    filename: str
# An empty-content message whose only payload is the attached file reference.
attachment_msg = HumanMessage(
    content="",
    # Please make sure your IPython kernel can access your filename.
    additional_kwargs={
        "attachments": [
            Attachment(filename="titanic.csv"),
        ],
    },
)
Invoke the agent as shown in the quick start:
In [3]:
Copied!
from datetime import date
from tablegpt.agent.file_reading import Stage
# Reading and processing files.
response = await agent.ainvoke(
input={
"entry_message": attachment_msg,
"processing_stage": Stage.UPLOADED,
"messages": [attachment_msg],
"parent_id": "some-parent-id1",
"date": date.today(),
},
config={
# Using checkpointer requires binding thread_id at runtime.
"configurable": {"thread_id": "some-thread-id"},
},
)
response["messages"]
from datetime import date
from tablegpt.agent.file_reading import Stage
# Reading and processing files.
response = await agent.ainvoke(
input={
"entry_message": attachment_msg,
"processing_stage": Stage.UPLOADED,
"messages": [attachment_msg],
"parent_id": "some-parent-id1",
"date": date.today(),
},
config={
# Using checkpointer requires binding thread_id at runtime.
"configurable": {"thread_id": "some-thread-id"},
},
)
response["messages"]
Out[3]:
[HumanMessage(content='', additional_kwargs={'attachments': [{'filename': 'titanic.csv'}]}, response_metadata={}, id='ab0a7157-ad7d-4de8-9b24-1bee78ad7c55'), AIMessage(content="我已经收到您的数据文件,我需要查看文件内容以对数据集有一个初步的了解。首先我会读取数据到 `df` 变量中,并通过 `df.info` 查看 NaN 情况和数据类型。\n```python\n# Load the data into a DataFrame\ndf = read_df('titanic.csv')\n\n# Remove leading and trailing whitespaces in column names\ndf.columns = df.columns.str.strip()\n\n# Remove rows and columns that contain only empty values\ndf = df.dropna(how='all').dropna(axis=1, how='all')\n\n# Get the basic information of the dataset\ndf.info(memory_usage=False)\n```", additional_kwargs={'parent_id': 'some-parent-id1', 'thought': '我已经收到您的数据文件,我需要查看文件内容以对数据集有一个初步的了解。首先我会读取数据到 `df` 变量中,并通过 `df.info` 查看 NaN 情况和数据类型。', 'action': {'tool': 'python', 'tool_input': "# Load the data into a DataFrame\ndf = read_df('titanic.csv')\n\n# Remove leading and trailing whitespaces in column names\ndf.columns = df.columns.str.strip()\n\n# Remove rows and columns that contain only empty values\ndf = df.dropna(how='all').dropna(axis=1, how='all')\n\n# Get the basic information of the dataset\ndf.info(memory_usage=False)"}, 'model_type': None}, response_metadata={}, id='add6691d-d7ea-411d-9699-e99ae0b7de97', tool_calls=[{'name': 'python', 'args': {'query': "# Load the data into a DataFrame\ndf = read_df('titanic.csv')\n\n# Remove leading and trailing whitespaces in column names\ndf.columns = df.columns.str.strip()\n\n# Remove rows and columns that contain only empty values\ndf = df.dropna(how='all').dropna(axis=1, how='all')\n\n# Get the basic information of the dataset\ndf.info(memory_usage=False)"}, 'id': 'b846aa01-04ef-4669-9a5c-53ddcb9a2dfb', 'type': 'tool_call'}]), ToolMessage(content=[{'type': 'text', 'text': "```pycon\n<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 4 entries, 0 to 3\nData columns (total 8 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 Pclass 4 non-null int64 \n 1 Sex 4 
non-null object \n 2 Age 4 non-null float64\n 3 SibSp 4 non-null int64 \n 4 Parch 4 non-null int64 \n 5 Fare 4 non-null float64\n 6 Embarked 4 non-null object \n 7 Survived 4 non-null int64 \ndtypes: float64(2), int64(4), object(2)\n```"}], name='python', id='0d441b21-bff3-463c-a07f-c0b12bd17bc5', tool_call_id='b846aa01-04ef-4669-9a5c-53ddcb9a2dfb', artifact=[]), AIMessage(content='接下来我将用 `df.head(5)` 来查看数据集的前 5 行。\n```python\n# Show the first 5 rows to understand the structure\ndf.head(5)\n```', additional_kwargs={'parent_id': 'some-parent-id1', 'thought': '接下来我将用 `df.head(5)` 来查看数据集的前 5 行。', 'action': {'tool': 'python', 'tool_input': '# Show the first 5 rows to understand the structure\ndf.head(5)'}, 'model_type': None}, response_metadata={}, id='5e26ef1d-7042-471e-b39f-194a51a185c7', tool_calls=[{'name': 'python', 'args': {'query': '# Show the first 5 rows to understand the structure\ndf.head(5)'}, 'id': 'f6be0d96-05b3-4b5b-8313-90197a8c3d87', 'type': 'tool_call'}]), ToolMessage(content=[{'type': 'text', 'text': '```pycon\n Pclass Sex Age SibSp Parch Fare Embarked Survived\n0 2 female 29.0 0 2 23.000 S 1\n1 3 female 39.0 1 5 31.275 S 0\n2 3 male 26.5 0 0 7.225 C 0\n3 3 male 32.0 0 0 56.496 S 1\n```'}], name='python', id='6fc6d8aa-546c-467e-91d3-d57b0b62dd68', tool_call_id='f6be0d96-05b3-4b5b-8313-90197a8c3d87', artifact=[]), AIMessage(content='我已经了解了数据集 titanic.csv 的基本信息。请问我可以帮您做些什么?', additional_kwargs={'parent_id': 'some-parent-id1'}, response_metadata={}, id='b6dc3885-94cb-4b0f-b691-f37c4c8c9ba3')]
Continue to ask questions for data analysis:
In [4]:
Copied!
human_message = HumanMessage(content="How many men survived?")
async for event in agent.astream_events(
input={
# After using checkpoint, you only need to add new messages here.
"messages": [human_message],
"parent_id": "some-parent-id2",
"date": date.today(),
},
version="v2",
# We configure the same thread_id to use checkpoints to retrieve the memory of the last run.
config={"configurable": {"thread_id": "some-thread-id"}},
):
event_name: str = event["name"]
evt: str = event["event"]
if evt == "on_chat_model_end":
print(event["data"]["output"])
elif event_name == "tool_node" and evt == "on_chain_stream":
for lc_msg in event["data"]["chunk"]["messages"]:
print(lc_msg)
else:
# Other events can be handled here.
pass
human_message = HumanMessage(content="How many men survived?")
async for event in agent.astream_events(
input={
# After using checkpoint, you only need to add new messages here.
"messages": [human_message],
"parent_id": "some-parent-id2",
"date": date.today(),
},
version="v2",
# We configure the same thread_id to use checkpoints to retrieve the memory of the last run.
config={"configurable": {"thread_id": "some-thread-id"}},
):
event_name: str = event["name"]
evt: str = event["event"]
if evt == "on_chat_model_end":
print(event["data"]["output"])
elif event_name == "tool_node" and evt == "on_chain_stream":
for lc_msg in event["data"]["chunk"]["messages"]:
print(lc_msg)
else:
# Other events can be handled here.
pass
content="为了回答您的问题,我将筛选出所有男性乘客并计算其中的幸存者数量。\n```python\n# Filter male passengers who survived and count them\nmale_survivors = df[(df['Sex'] == 'male') & (df['Survived'] == 1)]\nmale_survivors_count = male_survivors.shape[0]\nmale_survivors_count\n```" additional_kwargs={} response_metadata={'finish_reason': 'stop', 'model_name': 'TableGPT2-7B'} id='run-661d7496-341d-4a6b-84d8-b4094db66ef0' content=[{'type': 'text', 'text': '```pycon\n1\n```'}] name='python' id='1c7531db-9150-451d-a8dd-f07176454e6f' tool_call_id='2860e8bb-0fa7-421b-bb2d-bfeca873354b' artifact=[] content='根据数据集,有 1 名男性乘客幸存。' additional_kwargs={} response_metadata={'finish_reason': 'stop', 'model_name': 'TableGPT2-7B'} id='run-db640705-0085-4f47-adb4-3e0adce694cd'