A-Mahla committed on
Commit
28b1bb6
·
unverified ·
1 Parent(s): 8c205ef

improve document related shortcut (#15)

Browse files
cua2-core/src/cua2_core/services/agent_utils/desktop_agent.py CHANGED
@@ -172,15 +172,15 @@ class E2BVisionAgent(CodeAgent):
172
  return f"Typed text: '{clean_text}'"
173
 
174
  @tool
175
- def press(key: str) -> str:
176
  """
177
  Presses a keyboard key
178
  Args:
179
- key: The key to press (e.g. "enter", "space", "backspace", etc.).
180
  """
181
- self.desktop.press(key)
182
- self.logger.log(f"Pressed key: {key}")
183
- return f"Pressed key: {key}"
184
 
185
  @tool
186
  def go_back() -> str:
 
172
  return f"Typed text: '{clean_text}'"
173
 
174
  @tool
175
+ def press(keys: list[str]) -> str:
176
  """
177
  Presses a keyboard key or key combination
178
  Args:
179
+ keys: The keys to press; several keys are pressed together as a combination (e.g. ["enter"] or ["ctrl", "s"]).
180
  """
181
+ self.desktop.press(keys)
182
+ self.logger.log(f"Pressed keys: {keys}")
183
+ return f"Pressed keys: {keys}"
184
 
185
  @tool
186
  def go_back() -> str:
cua2-core/src/cua2_core/services/agent_utils/prompt.py CHANGED
@@ -97,6 +97,16 @@ Never manually click the browser icon — use `open_url()` directly for web page
97
  - For websites: `open_url("https://google.com")`
98
  - For applications: `launch("app_name")`
99
  - Never manually navigate to apps via clicking icons—use the open tools directly.
 
 
 
 
 
 
 
 
 
 
100
  - Complete one atomic action per step: e.g., **click**, **type**, or **wait**.
101
  - Never combine multiple tool calls in one step.
102
  - Validate that your previous action succeeded before continuing.
@@ -155,13 +165,14 @@ final_answer("The task is complete and the text 'Hello World' is visible in the
155
 
156
  <core_principles>
157
  - Think visually and spatially.
158
- - Always ground your reasoning in whats visible in the screenshot.
159
- - Never assume whats on the next screen.
160
  - Always check the result of your last action.
161
  - Be deliberate, consistent, and patient.
162
  - **ALWAYS START** by analyzing if the task requires opening an application or URL. If so, your **first action** must be:
163
  - For websites: `open_url("https://google.com")`
164
  - For applications: `open("app_name")`
165
  - **NEVER** manually navigate to apps via clicking icons—use the open tools directly.
 
166
  </core_principles>
167
  """.replace("<<current_date>>", datetime.now().strftime("%A, %d-%B-%Y"))
 
97
  - For websites: `open_url("https://google.com")`
98
  - For applications: `launch("app_name")`
99
  - Never manually navigate to apps via clicking icons—use the open tools directly.
100
+ - **For document handling**, prioritize using keyboard shortcuts for common operations instead of clicking menu items:
101
+ - Save document: `press(['ctrl', 's'])`
102
+ - Copy: `press(['ctrl', 'c'])`
103
+ - Paste: `press(['ctrl', 'v'])`
104
+ - Undo: `press(['ctrl', 'z'])`
105
+ - Select all: `press(['ctrl', 'a'])`
106
+ - Find: `press(['ctrl', 'f'])`
107
+ - New document: `press(['ctrl', 'n'])`
108
+ - Open file: `press(['ctrl', 'o'])`
109
+ - These shortcuts are faster, more reliable, and work across most applications.
110
  - Complete one atomic action per step: e.g., **click**, **type**, or **wait**.
111
  - Never combine multiple tool calls in one step.
112
  - Validate that your previous action succeeded before continuing.
 
165
 
166
  <core_principles>
167
  - Think visually and spatially.
168
+ - Always ground your reasoning in what's visible in the screenshot.
169
+ - Never assume what's on the next screen.
170
  - Always check the result of your last action.
171
  - Be deliberate, consistent, and patient.
172
  - **ALWAYS START** by analyzing if the task requires opening an application or URL. If so, your **first action** must be:
173
  - For websites: `open_url("https://google.com")`
174
  - For applications: `launch("app_name")`
175
  - **NEVER** manually navigate to apps via clicking icons—use the open tools directly.
176
+
177
  </core_principles>
178
  """.replace("<<current_date>>", datetime.now().strftime("%A, %d-%B-%Y"))