Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[QUESTION] Some problem with the bbox_pixels #189

Open
LYXFOREVER opened this issue Nov 23, 2024 · 4 comments
Open

[QUESTION] Some problem with the bbox_pixels #189

LYXFOREVER opened this issue Nov 23, 2024 · 4 comments

Comments

@LYXFOREVER
Copy link

4

I am trying to store the history list to build a dataset, but I ran into a problem.

I drew the UI boundaries corresponding to the screenshot onto the image and also plotted the actions. The coordinates of the actions represent the centers of the UI elements. I found that the action coordinates are accurate, but the UI boundaries look a bit off. Even though the coordinates are obtained through the bbox_pixels of the UIElement, why is there such a discrepancy?

@crawles
Copy link
Collaborator

crawles commented Nov 23, 2024

Is it possible it is a plotting error? Can you share your code you used to plot this?

@LYXFOREVER
Copy link
Author

I store the history as a .pkl file in minimal_task_runner.py:

# Serialize the agent's step history to disk so it can be re-loaded later.
with open(path, 'wb') as file:
    pickle.dump(agent.history, file)

Then in another read_history.py file I read the .pkl file:

def _plot_dual_point(touch_x, touch_y, ax):
    """Draw a tap (dual-point) marker at (touch_x, touch_y) on a matplotlib axis.

    Args:
        touch_x: X pixel coordinate of the tap.
        touch_y: Y pixel coordinate of the tap.
        ax: Matplotlib axis holding the screenshot.

    Returns:
        The same axis, with the '+' marker added.
    """
    # Log the tap location (message kept verbatim: "this is a click action, click coords").
    print("本次是点击动作,点击坐标", touch_x, touch_y)
    marker_style = dict(
        s=550,
        linewidths=5,
        color=_ACTION_COLOR,
        marker='+',
    )
    ax.scatter(touch_x, touch_y, **marker_style)
    return ax

def _get_annotation_positions(
element_list
):
"""Processes the annotation positions into distinct bounding boxes.

Args:
    element_list:由UIElement对象组成的列表

Returns:
    A matrix of annotation positions with dimensions (# of annotations, 4),
    where each annotation bounding box takes the form (y, x, h, w).
"""
positions = []

for ui in element_list:
    # 记得筛选掉看不见的ui
    if ui.bbox_pixels is not None and ui.is_visible:
        #首先要获取左下角的那个点的位置
        x = ui.bbox_pixels.x_min
        y = ui.bbox_pixels.y_max
        h = ui.bbox_pixels.y_max - ui.bbox_pixels.y_min
        w = ui.bbox_pixels.x_max - ui.bbox_pixels.x_min
        positions.append((y,x,h,w))

return positions

def _add_text(text, screen_width, screen_height, ax):
    """Render ``text`` in a filled box near the bottom-centre of the axis.

    Args:
        text: The string to display.
        screen_width: Screenshot width in pixels (text is centred horizontally).
        screen_height: Screenshot height in pixels (text sits at 95% height).
        ax: Matplotlib axis holding the screenshot.
    """
    anchor_x = screen_width * 0.5
    anchor_y = screen_height * 0.95
    label = ax.text(
        anchor_x,
        anchor_y,
        text,
        color='white',
        size=20,
        horizontalalignment='center',
        verticalalignment='center',
    )
    label.set_bbox({'facecolor': _ACTION_COLOR, 'alpha': 0.9})

def _plot_action(action, screen_height, screen_width, ax, bbox=None):
    """Overlay the example's action on the given matplotlib axis.

    Args:
        action: Action object; dispatched on ``action.action_type``.
        screen_height: Screenshot height in pixels (used to place text).
        screen_width: Screenshot width in pixels (used to place text).
        ax: Matplotlib axis holding the screenshot.
        bbox: Bounding box of the target element; its ``center`` is where
            the tap marker is drawn for 'click'/'input_text' actions.

    Returns:
        The axis with the marker drawn, or None for action types that are
        not visualised yet.
    """
    action_type = action.action_type
    if action_type not in ('click', 'input_text'):
        return None  # Other action types are not drawn for now.
    if action_type == 'input_text':
        # Typed text is rendered near the bottom of the screenshot.
        _add_text(action.text, screen_width, screen_height, ax)
    print(bbox)
    x, y = bbox.center
    return _plot_dual_point(x, y, ax)

def show_anno_and_action_img():
    """Render each step of a saved agent history as an annotated screenshot.

    Loads the pickled history, and for every step draws the pre-action
    screenshot, the executed action (tap marker / typed text), and the
    bounding boxes of all visible UI elements, saving one PNG per step to
    ``processed_history/<timestamp>/<step>.png``.
    """
    pkl_name = 'raw_history_pkl/2024_11_22_15_45_19_history.pkl'
    with open(pkl_name, 'rb') as f:
        history = pickle.load(f)

    par_doc_name = 'processed_history/'
    current_time = datetime.now()  # get current time
    formatted_time = current_time.strftime("%Y_%m_%d_%H_%M_%S")
    doc_name = par_doc_name + formatted_time

    # Create the output folder for this .pkl file.
    folder_path = Path(doc_name)
    if not folder_path.exists():
        folder_path.mkdir(parents=True)  # create doc
        print(f"文件夹 '{folder_path}' 已创建")
    else:
        print(f"文件夹 '{folder_path}' 已存在")

    for i, step in enumerate(history):
        # The screenshot captured before this step's action was executed.
        image = step['before_screenshot']
        image_height = image.shape[0]
        image_width = image.shape[1]

        # Set up the canvas.
        _, ax = plt.subplots(figsize=(8, 8))
        ax.imshow(image)

        # Extract and parse the action for this step.
        action_output = step['action_output']
        _, action = m3a_utils.parse_reason_action_output(action_output)
        try:
            converted_action = json_action.JSONAction(
                **agent_utils.extract_json(action),
            )
        except Exception as e:  # pylint: disable=broad-exception-caught
            print('Failed to convert the output to a valid action.')
            print(str(e))
            # BUG FIX: previously execution fell through with
            # `converted_action` undefined (NameError on step 0) or stale
            # from the previous iteration; skip this step instead.
            plt.close()
            continue

        # Draw the action onto the image.
        if converted_action.action_type in ('click', 'input_text'):
            print("本次操作element:", step['before_element_list'][converted_action.index])
            bbox = step['before_element_list'][converted_action.index].bbox_pixels
            _plot_action(converted_action, image_height, image_width, ax, bbox)
        else:
            _plot_action(converted_action, image_height, image_width, ax)

        # Draw the bounding box of every visible UI element.
        positions = _get_annotation_positions(step['before_element_list'])
        for y, x, h, w in positions:
            rect = patches.Rectangle(
                (x, y), w, h, linewidth=1, edgecolor='r', facecolor='none'
            )
            ax.add_patch(rect)

        # Save the annotated image for this step.
        plt.tight_layout()
        img_path = doc_name + '/' + str(i) + '.png'
        plt.savefig(img_path)
        # BUG FIX: close the figure so the loop does not accumulate open
        # matplotlib figures (each one pins its image in memory).
        plt.close()


show_anno_and_action_img()

I am a beginner and my code is a bit messy — please bear with me!

I followed the method in AITW to draw the bounding boxes and actions on the app screenshots. These functions seem to work fine on the AITW dataset.

@crawles
Copy link
Collaborator

crawles commented Nov 25, 2024

NP. I am unable to figure out exactly from your code. We have some plotting code that may help you.

from android_world.utils import plotting
from android_world.env import interface

# Example 1: render a whole episode as a single HTML report.
checkpointer = checkpointer_lib.IncrementalCheckpointer(YOUR_PATH)
episodes = checkpointer.load()
episode = episodes[0]
with open('/tmp/result.html', 'w') as f:
  # T3A.
  # data = m3a_utils.generate_single_task_html_for_gpt4_text(episode)
  # M3A.
  data = m3a_utils.generate_single_task_html_for_m3a(episode)

  f.write(data)
# Example 2: plot the UI elements of one step directly with the
# library's own plotting helper (avoids hand-rolled bbox math).
from android_world.utils import plotting
from android_world.env import interface

checkpointer = checkpointer_lib.IncrementalCheckpointer(YOUR_PATH)
episodes = checkpointer.load()
episode = episodes[0]
print(episode['episode_data'].keys())
# First step's screenshot and UI element list.
screenshot = episode['episode_data']['before_screenshot'][0]
ui_elements = episode['episode_data']['before_element_list'][0]

state = interface.State(pixels=screenshot, ui_elements=ui_elements, forest=None)

axs = plotting.plot_ui_elements(state)

@LYXFOREVER
Copy link
Author

NP. I am unable to figure out exactly from your code. We have some plotting code that may help you.

from android_world.utils import plotting from android_world.env import interface

checkpointer = checkpointer_lib.IncrementalCheckpointer(YOUR_PATH)
episodes = checkpointer.load()
episode = episodes[0]
with open('/tmp/result.html', 'w') as f:
  # T3A.
  # data = m3a_utils.generate_single_task_html_for_gpt4_text(episode)
  # M3A.
  data = m3a_utils.generate_single_task_html_for_m3a(episode)

  f.write(data)
from android_world.utils import plotting
from android_world.env import interface

checkpointer = checkpointer_lib.IncrementalCheckpointer(YOUR_PATH)
episodes = checkpointer.load()
episode = episodes[0]
print(episode['episode_data'].keys())
screenshot = episode['episode_data']['before_screenshot'][0]
ui_elements = episode['episode_data']['before_element_list'][0]

state = interface.State(pixels=screenshot, ui_elements=ui_elements, forest=None)

axs = plotting.plot_ui_elements(state)

May I ask what is checkpointer_lib?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants