Skip to content

Commit

Permalink
The New Computer Update Part II
Browse files Browse the repository at this point in the history
  • Loading branch information
KillianLucas committed Mar 11, 2024
1 parent aca11f7 commit d9829b5
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 41 deletions.
3 changes: 2 additions & 1 deletion interpreter/core/computer/browser/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def search(self, query):
Searches the web for the specified query and returns the results.
"""
response = requests.get(
f'{self.computer.api_base.strip("/")}/browser/search', params={"q": query}
f'{self.computer.api_base.strip("/")}/browser/search',
params={"query": query},
)
return response.json()["result"]
1 change: 0 additions & 1 deletion interpreter/core/computer/computer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def __init__(self, interpreter):
self.emit_images = True
self.api_base = "https://api.openinterpreter.com/v0"
self.save_skills = True
# self.api_base = "http://0.0.0.0/v0"

self.import_computer_api = True
self._has_imported_computer_api = False # Because we only want to do this once
Expand Down
39 changes: 34 additions & 5 deletions interpreter/core/computer/display/display.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import base64
import os
import platform
import pprint
import time
import warnings
Expand Down Expand Up @@ -64,12 +65,13 @@ def view(self, show=True, quadrant=None):
# def get_active_window(self):
# return get_active_window()

def screenshot(self, show=True, quadrant=None, active_app_only=False):
def screenshot(
self, show=True, quadrant=None, active_app_only=False, force_image=False
):
"""
Shows you what's on the screen by taking a screenshot of the entire screen or a specified quadrant. Returns a `pil_image` in case you need it (rarely). **You almost always want to do this first!**
"""
time.sleep(2)
if not self.computer.emit_images:
if not self.computer.emit_images and force_image == False:
text = self.get_text_as_list_of_lists()
pp = pprint.PrettyPrinter(indent=4)
pretty_text = pp.pformat(text) # language models like it pretty!
Expand All @@ -89,7 +91,10 @@ def screenshot(self, show=True, quadrant=None, active_app_only=False):
region = self.get_active_window()["region"]
screenshot = pyautogui.screenshot(region=region)
else:
screenshot = pyautogui.screenshot()
if platform.system() == "Darwin":
screenshot = take_screenshot_to_pil()
else:
screenshot = pyautogui.screenshot()
# message = format_to_recipient("Taking a screenshot of the entire screen. This is not recommended. You (the language model assistant) will receive it with low resolution.\n\nTo maximize performance, use computer.display.view(active_app_only=True). This will produce an ultra high quality image of the active application.", "assistant")
# print(message)

Expand Down Expand Up @@ -139,11 +144,14 @@ def find(self, description, screenshot=None):
print("NUM HASHES:", len(self._hashes))
else:
message = format_to_recipient(
"Locating this icon will take ~10 seconds. Subsequent icons should be found more quickly.",
"Locating this icon will take ~15 seconds. Subsequent icons should be found more quickly.",
recipient="user",
)
print(message)

if len(self._hashes) > 5000:
self._hashes = dict(list(self._hashes.items())[-5000:])

from .point.point import point

result = point(
Expand Down Expand Up @@ -251,3 +259,24 @@ def get_text_as_list_of_lists(self, screenshot=None):
raise Exception(
"Failed to find text locally.\n\nTo find text in order to use the mouse, please make sure you've installed `pytesseract` along with the Tesseract executable (see this Stack Overflow answer for help installing Tesseract: https://stackoverflow.com/questions/50951955/pytesseract-tesseractnotfound-error-tesseract-is-not-installed-or-its-not-i)."
)


import io
import subprocess

from PIL import Image


def take_screenshot_to_pil(filename="temp_screenshot.png"):
    """
    Capture the screen with the macOS `screencapture` tool and return it as a PIL image.

    The capture is written to `filename`, read fully into memory, and the file
    is then removed, so the returned image does not depend on the file.

    Args:
        filename: Path of the temporary PNG file that `screencapture` writes to.

    Returns:
        The image opened by `PIL.Image.open` from an in-memory copy of the file.

    Raises:
        subprocess.CalledProcessError: If `screencapture` exits with a non-zero status.
    """
    try:
        # Capture the screenshot and save it to a temporary file
        # (`-x` asks screencapture not to play the capture sound).
        subprocess.run(["screencapture", "-x", filename], check=True)

        # Pull the whole file into memory so it can be deleted before returning.
        with open(filename, "rb") as f:
            image_data = f.read()
    finally:
        # Always clean up the temporary file, even when the capture or the
        # read fails; a missing file must not mask the original error.
        try:
            os.remove(filename)
        except FileNotFoundError:
            pass

    # Open the image from the in-memory bytes
    return Image.open(io.BytesIO(image_data))
92 changes: 62 additions & 30 deletions interpreter/core/computer/display/point/point.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ def point(description, screenshot=None, debug=False, hashes=None):


def find_icon(description, screenshot=None, debug=False, hashes=None):
if debug:
print("STARTING")
if screenshot == None:
image_data = take_screenshot_to_pil()
else:
Expand All @@ -68,6 +70,9 @@ def find_icon(description, screenshot=None, debug=False, hashes=None):

icons_bounding_boxes = get_element_boxes(image_data, debug)

if debug:
print("GOT ICON BOUNDING BOXES")

debug_path = os.path.join(os.path.expanduser("~"), "Desktop", "oi-debug")

if debug:
Expand Down Expand Up @@ -123,8 +128,14 @@ def find_icon(description, screenshot=None, debug=False, hashes=None):

# # Filter out text

if debug:
print("GETTING TEXT")

response = pytesseract_get_text_bounding_boxes(screenshot)

if debug:
print("GOT TEXT, processing it")

if debug:
# Create a draw object
image_data_copy = image_data.copy()
Expand Down Expand Up @@ -416,7 +427,13 @@ def combine_boxes(icons_bounding_boxes):
if "icon" not in description.lower():
description += " icon"

top_icons = image_search(description, icons, hashes)
if debug:
print("FINALLY, SEARCHING")

top_icons = image_search(description, icons, hashes, debug)

if debug:
print("DONE")

coordinates = [t["coordinate"] for t in top_icons]

Expand Down Expand Up @@ -478,7 +495,7 @@ def embed_images(images: List[Image.Image], model, transforms):
model = model.to(device)


def image_search(query, icons, hashes):
def image_search(query, icons, hashes, debug):
hashed_icons = [icon for icon in icons if icon["hash"] in hashes]
unhashed_icons = [icon for icon in icons if icon["hash"] not in hashes]

Expand All @@ -488,7 +505,7 @@ def image_search(query, icons, hashes):
[query] + [icon["data"] for icon in unhashed_icons],
batch_size=128,
convert_to_tensor=True,
show_progress_bar=False,
show_progress_bar=debug,
)
else:
query_and_unhashed_icons_embeds = embed_images(
Expand Down Expand Up @@ -526,9 +543,10 @@ def image_search(query, icons, hashes):


def get_element_boxes(image_data, debug):
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
debug_path = os.path.join(desktop_path, "oi-debug")

if debug:
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
debug_path = os.path.join(desktop_path, "oi-debug")
if not os.path.exists(debug_path):
os.makedirs(debug_path)

Expand Down Expand Up @@ -662,6 +680,9 @@ def process_image(
pil_image, debug=debug, debug_path=debug_path
)

if debug:
print("WE HERE")

# Initialize an empty list to store the boxes
boxes = []
for contour in contours_contrasted:
Expand All @@ -670,30 +691,41 @@ def process_image(
# Append the box as a dictionary to the list
boxes.append({"x": x, "y": y, "width": w, "height": h})

# Remove any boxes whose edges cross over any contours
filtered_boxes = []
for box in boxes:
crosses_contour = False
for contour in contours_contrasted:
if (
cv2.pointPolygonTest(contour, (box["x"], box["y"]), False) >= 0
or cv2.pointPolygonTest(
contour, (box["x"] + box["width"], box["y"]), False
)
>= 0
or cv2.pointPolygonTest(
contour, (box["x"], box["y"] + box["height"]), False
)
>= 0
or cv2.pointPolygonTest(
contour, (box["x"] + box["width"], box["y"] + box["height"]), False
)
>= 0
):
crosses_contour = True
break
if not crosses_contour:
filtered_boxes.append(box)
boxes = filtered_boxes
if debug:
print("WE HHERE")

if (
False
): # Disabled. I thought this would be faster but it's actually slower than just embedding all of them.
# Remove any boxes whose edges cross over any contours
filtered_boxes = []
for box in boxes:
crosses_contour = False
for contour in contours_contrasted:
if (
cv2.pointPolygonTest(contour, (box["x"], box["y"]), False) >= 0
or cv2.pointPolygonTest(
contour, (box["x"] + box["width"], box["y"]), False
)
>= 0
or cv2.pointPolygonTest(
contour, (box["x"], box["y"] + box["height"]), False
)
>= 0
or cv2.pointPolygonTest(
contour,
(box["x"] + box["width"], box["y"] + box["height"]),
False,
)
>= 0
):
crosses_contour = True
break
if not crosses_contour:
filtered_boxes.append(box)
boxes = filtered_boxes

if debug:
print("WE HHHERE")

return boxes
5 changes: 4 additions & 1 deletion interpreter/terminal_interface/profiles/defaults/os.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,13 @@
computer.keyboard.hotkey(" ", "command") # Opens spotlight (very useful)
computer.keyboard.write("hello")
# Use this to click text:
computer.mouse.click("text onscreen") # This clicks on the UI element with that text. Use this **frequently** and get creative! To click a video, you could pass the *timestamp* (which is usually written on the thumbnail) into this.
# Use this to click an icon, button, or other symbol:
computer.mouse.click(icon="gear icon") # Moves mouse to the icon with that description. Use this very often.
computer.mouse.move("open recent >") # This moves the mouse over the UI element with that text. Many dropdowns will disappear if you click them. You have to hover over items to reveal more.
computer.mouse.click(x=500, y=500) # Use this very, very rarely. It's highly inaccurate
computer.mouse.click(icon="gear icon") # Moves mouse to the icon with that description. Use this very often
computer.mouse.scroll(-10) # Scrolls down. If you don't find some text on screen that you expected to be there, you probably want to do this
x, y = computer.display.center() # Get your bearings
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "open-interpreter"
packages = [
{include = "interpreter"},
]
version = "0.2.1-rc2" # Use "-rc1", "-rc2", etc. for pre-release versions
version = "0.2.1" # Use "-rc1", "-rc2", etc. for pre-release versions
description = "Let language models run code"
authors = ["Killian Lucas <killian@openinterpreter.com>"]
readme = "README.md"
Expand Down
14 changes: 12 additions & 2 deletions tests/test_interpreter.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,12 @@

@pytest.mark.skip(reason="Computer with display only + no way to fail test")
def test_point():
interpreter.offline = True
# interpreter.computer.debug = True
interpreter.computer.mouse.move(icon="gear")
interpreter.computer.mouse.move(icon="refresh")
# interpreter.computer.mouse.move("Spaces:")
interpreter.computer.mouse.move(icon="play")
interpreter.computer.mouse.move(icon="magnifying glass")
interpreter.computer.mouse.move("Spaces:")
assert False


Expand Down Expand Up @@ -62,6 +63,15 @@ def test_skills():
assert "testing_skilsl" in str(output)


@pytest.mark.skip(reason="Local only")
def test_browser():
    """Run a browser search against a locally hosted API and print the result."""
    # Point the computer API at the local server instead of the hosted one.
    interpreter.computer.api_base = "http://0.0.0.0:80/v0"

    question = "When's the next Dune showing in Seattle?"
    search_result = interpreter.computer.browser.search(question)
    print(search_result)

    # Unconditional failure, kept as-is from the original test.
    assert False


@pytest.mark.skip(reason="Computer with display only + no way to fail test")
def test_display_api():
start = time.time()
Expand Down

0 comments on commit d9829b5

Please sign in to comment.