Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[QUESTION] Some problem with the bbox_pixels #189

Open
LYXFOREVER opened this issue Nov 23, 2024 · 4 comments
Open

[QUESTION] Some problem with the bbox_pixels #189

LYXFOREVER opened this issue Nov 23, 2024 · 4 comments

Comments

@LYXFOREVER
Copy link

4

I am trying to store the history list to build a dataset, but I ran into a problem.

I drew the UI boundaries corresponding to the screenshot onto the image and also plotted the actions. The coordinates of the actions represent the centers of the UI elements. I found that the action coordinates are accurate, but the UI boundaries look a bit off. Even though the coordinates are obtained through the bbox_pixels of the UIElement, why is there such a discrepancy?

@crawles
Copy link
Collaborator

crawles commented Nov 23, 2024

Is it possible it is a plotting error? Can you share your code you used to plot this?

@LYXFOREVER
Copy link
Author

I store the history as a .pkl file in minimal_task_runner.py:

# Serialize the agent's step history to disk so it can be re-loaded later.
with open(path, 'wb') as file:
    pickle.dump(agent.history, file)

Then in another read_history.py file I read the .pkl file:

def _plot_dual_point(touch_x, touch_y, ax):
    """Draw a tap (dual-point) marker at (touch_x, touch_y) on a matplotlib axis.

    Args:
        touch_x: X pixel coordinate of the tap.
        touch_y: Y pixel coordinate of the tap.
        ax: Matplotlib axis holding the screenshot.

    Returns:
        The same axis, with the '+' marker added.
    """
    # Log the tap location (message kept verbatim: "this is a click action, click coords").
    print("本次是点击动作,点击坐标", touch_x, touch_y)
    marker_style = dict(
        s=550,
        linewidths=5,
        color=_ACTION_COLOR,
        marker='+',
    )
    ax.scatter(touch_x, touch_y, **marker_style)
    return ax

def _get_annotation_positions(
element_list
):
"""Processes the annotation positions into distinct bounding boxes.

Args:
    element_list:由UIElement对象组成的列表

Returns:
    A matrix of annotation positions with dimensions (# of annotations, 4),
    where each annotation bounding box takes the form (y, x, h, w).
"""
positions = []

for ui in element_list:
    # 记得筛选掉看不见的ui
    if ui.bbox_pixels is not None and ui.is_visible:
        #首先要获取左下角的那个点的位置
        x = ui.bbox_pixels.x_min
        y = ui.bbox_pixels.y_max
        h = ui.bbox_pixels.y_max - ui.bbox_pixels.y_min
        w = ui.bbox_pixels.x_max - ui.bbox_pixels.x_min
        positions.append((y,x,h,w))

return positions

def _add_text(text, screen_width, screen_height, ax):
    """Render ``text`` in a filled box near the bottom-centre of the axis.

    Args:
        text: The string to display.
        screen_width: Screenshot width in pixels (text is centred horizontally).
        screen_height: Screenshot height in pixels (text sits at 95% height).
        ax: Matplotlib axis holding the screenshot.
    """
    anchor_x = screen_width * 0.5
    anchor_y = screen_height * 0.95
    label = ax.text(
        anchor_x,
        anchor_y,
        text,
        color='white',
        size=20,
        horizontalalignment='center',
        verticalalignment='center',
    )
    label.set_bbox({'facecolor': _ACTION_COLOR, 'alpha': 0.9})

def _plot_action(action, screen_height, screen_width, ax, bbox=None):
    """Overlay the example's action on the given matplotlib axis.

    Args:
        action: Action object; dispatched on ``action.action_type``.
        screen_height: Screenshot height in pixels (used to place text).
        screen_width: Screenshot width in pixels (used to place text).
        ax: Matplotlib axis holding the screenshot.
        bbox: Bounding box of the target element; its ``center`` is where
            the tap marker is drawn for 'click'/'input_text' actions.

    Returns:
        The axis with the marker drawn, or None for action types that are
        not visualised yet.
    """
    action_type = action.action_type
    if action_type not in ('click', 'input_text'):
        return None  # Other action types are not drawn for now.
    if action_type == 'input_text':
        # Typed text is rendered near the bottom of the screenshot.
        _add_text(action.text, screen_width, screen_height, ax)
    print(bbox)
    x, y = bbox.center
    return _plot_dual_point(x, y, ax)

def show_anno_and_action_img():
    """Render each step of a saved agent history as an annotated screenshot.

    Loads the pickled history, and for every step draws the pre-action
    screenshot, the executed action (tap marker / typed text), and the
    bounding boxes of all visible UI elements, saving one PNG per step to
    ``processed_history/<timestamp>/<step>.png``.
    """
    pkl_name = 'raw_history_pkl/2024_11_22_15_45_19_history.pkl'
    with open(pkl_name, 'rb') as f:
        history = pickle.load(f)

    par_doc_name = 'processed_history/'
    current_time = datetime.now()  # get current time
    formatted_time = current_time.strftime("%Y_%m_%d_%H_%M_%S")
    doc_name = par_doc_name + formatted_time

    # Create the output folder for this .pkl file.
    folder_path = Path(doc_name)
    if not folder_path.exists():
        folder_path.mkdir(parents=True)  # create doc
        print(f"文件夹 '{folder_path}' 已创建")
    else:
        print(f"文件夹 '{folder_path}' 已存在")

    for i, step in enumerate(history):
        # The screenshot captured before this step's action was executed.
        image = step['before_screenshot']
        image_height = image.shape[0]
        image_width = image.shape[1]

        # Set up the canvas.
        _, ax = plt.subplots(figsize=(8, 8))
        ax.imshow(image)

        # Extract and parse the action for this step.
        action_output = step['action_output']
        _, action = m3a_utils.parse_reason_action_output(action_output)
        try:
            converted_action = json_action.JSONAction(
                **agent_utils.extract_json(action),
            )
        except Exception as e:  # pylint: disable=broad-exception-caught
            print('Failed to convert the output to a valid action.')
            print(str(e))
            # BUG FIX: previously execution fell through with
            # `converted_action` undefined (NameError on step 0) or stale
            # from the previous iteration; skip this step instead.
            plt.close()
            continue

        # Draw the action onto the image.
        if converted_action.action_type in ('click', 'input_text'):
            print("本次操作element:", step['before_element_list'][converted_action.index])
            bbox = step['before_element_list'][converted_action.index].bbox_pixels
            _plot_action(converted_action, image_height, image_width, ax, bbox)
        else:
            _plot_action(converted_action, image_height, image_width, ax)

        # Draw the bounding box of every visible UI element.
        positions = _get_annotation_positions(step['before_element_list'])
        for y, x, h, w in positions:
            rect = patches.Rectangle(
                (x, y), w, h, linewidth=1, edgecolor='r', facecolor='none'
            )
            ax.add_patch(rect)

        # Save the annotated image for this step.
        plt.tight_layout()
        img_path = doc_name + '/' + str(i) + '.png'
        plt.savefig(img_path)
        # BUG FIX: close the figure so the loop does not accumulate open
        # matplotlib figures (each one pins its image in memory).
        plt.close()


show_anno_and_action_img()

I am a beginner and my code is a bit messy — please bear with me!

I followed the method in AITW to draw the bounding boxes and actions on the app screenshots. These functions seem to work fine on the AITW dataset.

@crawles
Copy link
Collaborator

crawles commented Nov 25, 2024

NP. I am unable to figure out exactly from your code. We have some plotting code that may help you.

from android_world.utils import plotting
from android_world.env import interface

# Example 1: render a whole episode as a single HTML report.
checkpointer = checkpointer_lib.IncrementalCheckpointer(YOUR_PATH)
episodes = checkpointer.load()
episode = episodes[0]
with open('/tmp/result.html', 'w') as f:
  # T3A.
  # data = m3a_utils.generate_single_task_html_for_gpt4_text(episode)
  # M3A.
  data = m3a_utils.generate_single_task_html_for_m3a(episode)

  f.write(data)
# Example 2: plot the UI elements of one step directly with the
# library's own plotting helper (avoids hand-rolled bbox math).
from android_world.utils import plotting
from android_world.env import interface

checkpointer = checkpointer_lib.IncrementalCheckpointer(YOUR_PATH)
episodes = checkpointer.load()
episode = episodes[0]
print(episode['episode_data'].keys())
# First step's screenshot and UI element list.
screenshot = episode['episode_data']['before_screenshot'][0]
ui_elements = episode['episode_data']['before_element_list'][0]

state = interface.State(pixels=screenshot, ui_elements=ui_elements, forest=None)

axs = plotting.plot_ui_elements(state)

@LYXFOREVER
Copy link
Author

NP. I am unable to figure out exactly from your code. We have some plotting code that may help you.

from android_world.utils import plotting from android_world.env import interface

checkpointer = checkpointer_lib.IncrementalCheckpointer(YOUR_PATH)
episodes = checkpointer.load()
episode = episodes[0]
with open('/tmp/result.html', 'w') as f:
  # T3A.
  # data = m3a_utils.generate_single_task_html_for_gpt4_text(episode)
  # M3A.
  data = m3a_utils.generate_single_task_html_for_m3a(episode)

  f.write(data)
from android_world.utils import plotting
from android_world.env import interface

checkpointer = checkpointer_lib.IncrementalCheckpointer(YOUR_PATH)
episodes = checkpointer.load()
episode = episodes[0]
print(episode['episode_data'].keys())
screenshot = episode['episode_data']['before_screenshot'][0]
ui_elements = episode['episode_data']['before_element_list'][0]

state = interface.State(pixels=screenshot, ui_elements=ui_elements, forest=None)

axs = plotting.plot_ui_elements(state)

May I ask what is checkpointer_lib?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants