aw1app committed
Commit 12158f3 · 1 Parent(s): 98bbe36

Add pre-computed results viewer
README.md CHANGED
@@ -7,43 +7,10 @@ sdk: gradio
 sdk_version: 4.8.0
 app_file: app.py
 pinned: false
-license: mit
 ---
 
-# RoboFlamingo Demo 🤖
-
-Interactive demo for **RoboFlamingo: Vision-Language Foundation Models as Effective Robot Imitators**
-
-## About
-
-RoboFlamingo adapts pre-trained Vision-Language Models for robot manipulation through efficient fine-tuning.
-
-### Key Features
-- 🎯 SOTA performance on CALVIN (4.09 avg task length)
-- 💪 Frozen OpenFlamingo + trainable policy head
-- ⚡ Single GPU training
-- 🎨 Natural language robot control
-
-## Architecture
-
-```
-Images → ViT → Perceiver → Flamingo → Policy → Actions
-        (frozen)  (trainable)  (partial)  (trainable)
-```
-
-## Citation
-
-```bibtex
-@article{li2023vision,
-  title={Vision-Language Foundation Models as Effective Robot Imitators},
-  author={Li, Xinghang and Liu, Minghuan and Zhang, Hanbo and Yu, Cunjun and Xu, Jie and Wu, Hongtao and Cheang, Chilam and Jing, Ya and Zhang, Weinan and Liu, Huaping and Li, Hang and Kong, Tao},
-  journal={arXiv preprint arXiv:2311.01378},
-  year={2023}
-}
-```
-
-## Resources
-
-- 📄 [Paper](https://arxiv.org/abs/2311.01378)
-- 💻 [GitHub](https://github.com/RoboFlamingo/RoboFlamingo)
-- 🤗 [Models](https://huggingface.co/roboflamingo)
+# RoboFlamingo Demo
+
+Pre-computed inference results from the RoboFlamingo model.
+
+[Paper](https://arxiv.org/abs/2311.01378) | [Code](https://github.com/RoboFlamingo/RoboFlamingo)
app.py CHANGED
@@ -1,224 +1,85 @@
 import gradio as gr
-import torch
+import json
 import numpy as np
 from PIL import Image
 import matplotlib.pyplot as plt
 from io import BytesIO
+from pathlib import Path
 
-# ============================================================================
-# MODEL (Demo Version)
-# ============================================================================
-
-class RoboFlamingoDemo:
-    """Simplified RoboFlamingo demo for HuggingFace Spaces."""
-
-    def __init__(self):
-        print("🤖 Initializing RoboFlamingo Demo...")
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        print(f"📍 Using device: {self.device}")
-
-    def predict_actions(self, instruction, third_view_img, gripper_view_img):
-        """Predict robot actions given instruction and images."""
-
-        window_size = 12  # RoboFlamingo uses 12 timesteps
-
-        # Simulate 7D pose predictions
-        actions = []
-        for t in range(window_size):
-            progress = t / window_size
-            action = {
-                'timestep': t,
-                'delta_x': 0.05 * progress,
-                'delta_y': 0.02 * progress,
-                'delta_z': -0.03 * (1 - progress),
-                'qw': 0.99,
-                'qx': 0.01,
-                'qy': 0.01,
-                'qz': 0.01,
-            }
-            actions.append(action)
-
-        # Simulate gripper predictions
-        gripper_commands = [1 if t > window_size // 2 else 0 for t in range(window_size)]
-
-        return {'actions': actions, 'gripper': gripper_commands, 'instruction': instruction}
-
-# Initialize model
-demo_model = RoboFlamingoDemo()
-
-# ============================================================================
-# VISUALIZATION
-# ============================================================================
-
-def create_trajectory_plot(actions):
-    """Create 3D trajectory visualization."""
-    fig = plt.figure(figsize=(10, 8))
+RESULTS_DIR = Path("inference_results")
+
+def load_results():
+    results = {}
+    for f in RESULTS_DIR.glob("*.json"):
+        try:
+            data = json.load(open(f))
+            if 'id' in data and data['id'] != 'index':
+                results[data['id']] = data
+        except: pass
+    return results
+
+RESULTS = load_results()
+print(f"Loaded {len(RESULTS)} examples")
+
+def plot_traj(acts):
+    fig = plt.figure(figsize=(10, 8))
     ax = fig.add_subplot(111, projection='3d')
-
-    x_positions = [a['delta_x'] for a in actions]
-    y_positions = [a['delta_y'] for a in actions]
-    z_positions = [a['delta_z'] for a in actions]
-
-    x_cum = np.cumsum(x_positions)
-    y_cum = np.cumsum(y_positions)
-    z_cum = np.cumsum(z_positions)
-
-    ax.plot(x_cum, y_cum, z_cum, 'b-', linewidth=2, marker='o', markersize=6)
-    ax.scatter([x_cum[0]], [y_cum[0]], [z_cum[0]], c='green', s=100, marker='o', label='Start')
-    ax.scatter([x_cum[-1]], [y_cum[-1]], [z_cum[-1]], c='red', s=100, marker='*', label='End')
-
-    ax.set_xlabel('X Position (m)')
-    ax.set_ylabel('Y Position (m)')
-    ax.set_zlabel('Z Position (m)')
-    ax.set_title('Predicted End-Effector Trajectory')
-    ax.legend()
-    ax.grid(True)
-
+    x = np.cumsum([a['delta_x'] for a in acts])
+    y = np.cumsum([a['delta_y'] for a in acts])
+    z = np.cumsum([a['delta_z'] for a in acts])
+    ax.plot(x, y, z, 'b-', lw=2, marker='o', ms=6)
+    ax.scatter(x[0], y[0], z[0], c='green', s=100, label='Start')
+    ax.scatter(x[-1], y[-1], z[-1], c='red', s=100, label='End')
+    ax.set_xlabel('X'); ax.set_ylabel('Y'); ax.set_zlabel('Z')
+    ax.legend(); ax.grid()
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
-    buf.seek(0)
-    plt.close()
-
+    buf.seek(0); plt.close()
    return Image.open(buf)
 
-def create_gripper_timeline(gripper_commands):
-    """Create gripper state timeline."""
-    fig, ax = plt.subplots(figsize=(12, 3))
-
-    timesteps = list(range(len(gripper_commands)))
-    colors = ['green' if cmd == 0 else 'red' for cmd in gripper_commands]
-    labels = ['OPEN' if cmd == 0 else 'CLOSE' for cmd in gripper_commands]
-
-    ax.bar(timesteps, [1]*len(timesteps), color=colors, alpha=0.7, edgecolor='black')
-
-    for i, (t, label) in enumerate(zip(timesteps, labels)):
-        ax.text(t, 0.5, label, ha='center', va='center', fontweight='bold', fontsize=8)
-
-    ax.set_xlabel('Timestep')
-    ax.set_ylabel('Gripper State')
-    ax.set_title('Predicted Gripper Commands (Green=Open, Red=Close)')
-    ax.set_ylim(0, 1.2)
-    ax.set_xticks(timesteps)
-    ax.grid(True, alpha=0.3)
-
+def plot_grip(grip):
+    fig, ax = plt.subplots(figsize=(12, 3))
+    cols = ['green' if g==0 else 'red' for g in grip]
+    ax.bar(range(len(grip)), [1]*len(grip), color=cols, alpha=0.7, ec='black')
+    for i, g in enumerate(grip):
+        ax.text(i, 0.5, 'OPEN' if g==0 else 'CLOSE', ha='center', va='center', weight='bold')
+    ax.set_xlabel('Timestep'); ax.set_ylim(0, 1.2); ax.grid(alpha=0.3)
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
-    buf.seek(0)
-    plt.close()
-
+    buf.seek(0); plt.close()
    return Image.open(buf)
 
-def format_actions_table(actions):
-    """Format actions as markdown table."""
-    table = "| Timestep | Δx (m) | Δy (m) | Δz (m) | Quaternion (w,x,y,z) |\n"
-    table += "|----------|--------|--------|--------|----------------------|\n"
-
-    for action in actions:
-        table += f"| {action['timestep']:2d} | "
-        table += f"{action['delta_x']:6.3f} | "
-        table += f"{action['delta_y']:6.3f} | "
-        table += f"{action['delta_z']:6.3f} | "
-        table += f"({action['qw']:.2f}, {action['qx']:.2f}, {action['qy']:.2f}, {action['qz']:.2f}) |\n"
-
-    return table
-
-# ============================================================================
-# MAIN PREDICTION FUNCTION
-# ============================================================================
-
-def predict_robot_actions(instruction, third_view_image, gripper_view_image):
-    """Main prediction function."""
-
-    if not instruction or instruction.strip() == "":
-        return None, None, "", "❌ Please enter an instruction!"
-
-    if third_view_image is None:
-        return None, None, "", "❌ Please upload a third-person view image!"
-
-    if gripper_view_image is None:
-        return None, None, "", "❌ Please upload a gripper view image!"
-
-    try:
-        if isinstance(third_view_image, np.ndarray):
-            third_view_image = Image.fromarray(third_view_image)
-        if isinstance(gripper_view_image, np.ndarray):
-            gripper_view_image = Image.fromarray(gripper_view_image)
-
-        results = demo_model.predict_actions(instruction, third_view_image, gripper_view_image)
-
-        trajectory_plot = create_trajectory_plot(results['actions'])
-        gripper_plot = create_gripper_timeline(results['gripper'])
-        actions_table = format_actions_table(results['actions'])
-
-        status = f"✅ Successfully predicted actions for: '{instruction}'"
-
-        return trajectory_plot, gripper_plot, actions_table, status
-
-    except Exception as e:
-        return None, None, "", f"❌ Error: {str(e)}"
-
-# ============================================================================
-# GRADIO INTERFACE
-# ============================================================================
+def show(eid):
+    if not RESULTS or eid not in RESULTS:
+        return [None]*6
+    r = RESULTS[eid]
+    img1 = Image.open(RESULTS_DIR / r['third_view_image'])
+    img2 = Image.open(RESULTS_DIR / r['gripper_view_image'])
+    traj = plot_traj(r['actions'])
+    grip = plot_grip(r['gripper'])
+    table = "| T | Δx | Δy | Δz |\n|--|--|--|--|\n"
+    for a in r['actions']:
+        table += f"| {a['timestep']} | {a['delta_x']:.3f} | {a['delta_y']:.3f} | {a['delta_z']:.3f} |\n"
+    status = f"✅ {r['instruction']}\nModel: {r['model_type']}"
+    return img1, img2, traj, grip, table, status
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-
-    gr.Markdown("""
-    # 🤖 RoboFlamingo Demo
-
-    ### Vision-Language Foundation Models as Effective Robot Imitators
-
-    **How to use:**
-    1. Upload third-person camera view
-    2. Upload gripper camera view
-    3. Enter instruction (e.g., "Pick up the red block")
-    4. Click "Predict Actions"
-
-    ---
-    """)
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Markdown("### 📥 Inputs")
-
-            instruction_input = gr.Textbox(
-                label="Language Instruction",
-                placeholder="e.g., Pick up the red block and put it in the drawer",
-                lines=3
-            )
-
-            with gr.Row():
-                third_view_input = gr.Image(label="Third-Person View", type="pil", height=250)
-                gripper_view_input = gr.Image(label="Gripper View", type="pil", height=250)
-
-            predict_button = gr.Button("🚀 Predict Actions", variant="primary", size="lg")
-            status_output = gr.Textbox(label="Status", interactive=False, lines=2)
-
-        with gr.Column(scale=1):
-            gr.Markdown("### 📊 Predictions")
-            trajectory_output = gr.Image(label="Predicted Trajectory (3D)", type="pil")
-            gripper_output = gr.Image(label="Gripper Commands", type="pil")
-
+    gr.Markdown("# 🤖 RoboFlamingo Demo\nPre-computed results viewer")
+    sel = gr.Dropdown(list(RESULTS.keys()) if RESULTS else ["None"],
+                      label="Example", value=list(RESULTS.keys())[0] if RESULTS else None)
     with gr.Row():
-        actions_table_output = gr.Markdown(label="Detailed Actions")
-
-    predict_button.click(
-        fn=predict_robot_actions,
-        inputs=[instruction_input, third_view_input, gripper_view_input],
-        outputs=[trajectory_output, gripper_output, actions_table_output, status_output]
-    )
-
-    gr.Markdown("""
-    ---
-
-    ### 📚 About RoboFlamingo
-
-    - **Paper:** [arXiv:2311.01378](https://arxiv.org/abs/2311.01378)
-    - **Code:** [GitHub](https://github.com/RoboFlamingo/RoboFlamingo)
-    - **Models:** [HuggingFace](https://huggingface.co/roboflamingo)
-
-    **Note:** Demo version with simulated predictions. For full model, see GitHub.
-    """)
+        with gr.Column():
+            gr.Markdown("### Inputs")
+            i1 = gr.Image(label="Third View", type="pil")
+            i2 = gr.Image(label="Gripper", type="pil")
+            st = gr.Markdown()
+        with gr.Column():
+            gr.Markdown("### Predictions")
+            o1 = gr.Image(label="Trajectory", type="pil")
+            o2 = gr.Image(label="Gripper", type="pil")
+            tab = gr.Markdown()
+    sel.change(show, [sel], [i1, i2, o1, o2, tab, st])
+    demo.load(show, [sel], [i1, i2, o1, o2, tab, st])
+    gr.Markdown("[Paper](https://arxiv.org/abs/2311.01378) | [Code](https://github.com/RoboFlamingo/RoboFlamingo)")
 
-if __name__ == "__main__":
-    demo.launch()
+demo.launch()
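
For reference, the data contract between `load_results()`/`show()` and the files added below can be sketched as follows: each `inference_results/<id>.json` needs an `id` (anything other than `"index"`), an `instruction`, two image filenames resolved relative to `RESULTS_DIR`, an `actions` list of per-timestep dicts with `delta_x`/`delta_y`/`delta_z` plus quaternion fields, a `gripper` list, and a `model_type`. A minimal generator sketch — the `save_example` helper and its demo values are illustrative assumptions, not part of this commit:

```python
import json
from pathlib import Path

RESULTS_DIR = Path("inference_results")
RESULTS_DIR.mkdir(exist_ok=True)

def save_example(eid, instruction, actions, gripper):
    """Hypothetical helper: write one result file in the shape show() reads."""
    record = {
        "id": eid,                               # load_results() drops entries with id == 'index'
        "instruction": instruction,
        "description": instruction,              # present in the committed files, unused by the viewer
        "third_view_image": f"{eid}_third.png",  # opened as Image.open(RESULTS_DIR / ...)
        "gripper_view_image": f"{eid}_gripper.png",
        "actions": actions,                      # per-timestep deltas + quaternion (qw, qx, qy, qz)
        "gripper": gripper,                      # 0 = open, 1 = close, one entry per timestep
        "model_type": "Simulation",
    }
    (RESULTS_DIR / f"{eid}.json").write_text(json.dumps(record, indent=2))

# The same linear-ramp pattern as the committed files: delta_x = 0.05*t/12, etc.
acts = [{"timestep": t, "delta_x": 0.05 * t / 12, "delta_y": 0.02 * t / 12,
         "delta_z": -0.03 * (1 - t / 12), "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01}
        for t in range(12)]
save_example("demo_task", "Pick up the red block", acts, [0] * 6 + [1] * 6)
```

Since `show()` also calls `Image.open` on the two PNG names, the matching `*_third.png` and `*_gripper.png` files must sit next to the JSON.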
 
inference_results/close_drawer.json ADDED
@@ -0,0 +1,148 @@
+{
+  "id": "close_drawer",
+  "instruction": "Close the drawer",
+  "description": "Push the drawer to close it",
+  "third_view_image": "close_drawer_third.png",
+  "gripper_view_image": "close_drawer_gripper.png",
+  "actions": [
+    {"timestep": 0, "delta_x": 0.0, "delta_y": 0.0, "delta_z": -0.03, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 1, "delta_x": 0.004166666666666667, "delta_y": 0.0016666666666666666, "delta_z": -0.027499999999999997, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 2, "delta_x": 0.008333333333333333, "delta_y": 0.003333333333333333, "delta_z": -0.025, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 3, "delta_x": 0.0125, "delta_y": 0.005, "delta_z": -0.0225, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 4, "delta_x": 0.016666666666666666, "delta_y": 0.006666666666666666, "delta_z": -0.02, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 5, "delta_x": 0.020833333333333336, "delta_y": 0.008333333333333333, "delta_z": -0.017499999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 6, "delta_x": 0.025, "delta_y": 0.01, "delta_z": -0.015, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 7, "delta_x": 0.02916666666666667, "delta_y": 0.011666666666666667, "delta_z": -0.012499999999999999, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 8, "delta_x": 0.03333333333333333, "delta_y": 0.013333333333333332, "delta_z": -0.01, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 9, "delta_x": 0.037500000000000006, "delta_y": 0.015, "delta_z": -0.0075, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 10, "delta_x": 0.04166666666666667, "delta_y": 0.016666666666666666, "delta_z": -0.004999999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 11, "delta_x": 0.04583333333333334, "delta_y": 0.018333333333333333, "delta_z": -0.002500000000000001, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01}
+  ],
+  "gripper": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+  "model_type": "Simulation",
+  "metadata": {"device": "cuda", "num_timesteps": 12}
+}
inference_results/close_drawer_gripper.png ADDED
inference_results/close_drawer_third.png ADDED
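
All five result files in this commit share this exact simulated trajectory; only `id`, `instruction`, `description`, and the image filenames differ. As a quick sanity check on the numbers, `plot_traj` integrates the deltas with `np.cumsum`, so the plotted end point follows directly from the linear ramps (a throwaway verification snippet, not part of the commit):

```python
import numpy as np

# Deltas as stored in the JSON: delta_x = 0.05*t/12, delta_y = 0.02*t/12,
# delta_z = -0.03*(1 - t/12), for t = 0..11.
dx = [0.05 * t / 12 for t in range(12)]
dy = [0.02 * t / 12 for t in range(12)]
dz = [-0.03 * (1 - t / 12) for t in range(12)]

# Cumulative end point, as plot_traj computes it: ≈ (0.275, 0.110, -0.195)
print(np.cumsum(dx)[-1], np.cumsum(dy)[-1], np.cumsum(dz)[-1])
```
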
inference_results/index.json ADDED
@@ -0,0 +1,12 @@
+{
+  "total_examples": 5,
+  "examples": [
+    "pick_red_block",
+    "open_drawer",
+    "place_in_drawer",
+    "close_drawer",
+    "slide_block"
+  ],
+  "model_type": "Simulation",
+  "generated_at": "1763025901.712671"
+}
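
Two quirks in this index are worth flagging: `load_results()` parses it but then drops it, since it has no `id` key (so the `data['id'] != 'index'` guard never actually fires), and `generated_at` is a stringified Unix timestamp rather than an ISO date, so any consumer has to convert it, e.g. (illustrative only):

```python
from datetime import datetime, timezone

ts = "1763025901.712671"  # generated_at from index.json
print(datetime.fromtimestamp(float(ts), tz=timezone.utc).isoformat())
```
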
inference_results/open_drawer.json ADDED
@@ -0,0 +1,148 @@
+{
+  "id": "open_drawer",
+  "instruction": "Open the drawer",
+  "description": "Pull the drawer handle to open",
+  "third_view_image": "open_drawer_third.png",
+  "gripper_view_image": "open_drawer_gripper.png",
+  "actions": [
+    {"timestep": 0, "delta_x": 0.0, "delta_y": 0.0, "delta_z": -0.03, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 1, "delta_x": 0.004166666666666667, "delta_y": 0.0016666666666666666, "delta_z": -0.027499999999999997, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 2, "delta_x": 0.008333333333333333, "delta_y": 0.003333333333333333, "delta_z": -0.025, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 3, "delta_x": 0.0125, "delta_y": 0.005, "delta_z": -0.0225, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 4, "delta_x": 0.016666666666666666, "delta_y": 0.006666666666666666, "delta_z": -0.02, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 5, "delta_x": 0.020833333333333336, "delta_y": 0.008333333333333333, "delta_z": -0.017499999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 6, "delta_x": 0.025, "delta_y": 0.01, "delta_z": -0.015, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 7, "delta_x": 0.02916666666666667, "delta_y": 0.011666666666666667, "delta_z": -0.012499999999999999, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 8, "delta_x": 0.03333333333333333, "delta_y": 0.013333333333333332, "delta_z": -0.01, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 9, "delta_x": 0.037500000000000006, "delta_y": 0.015, "delta_z": -0.0075, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 10, "delta_x": 0.04166666666666667, "delta_y": 0.016666666666666666, "delta_z": -0.004999999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 11, "delta_x": 0.04583333333333334, "delta_y": 0.018333333333333333, "delta_z": -0.002500000000000001, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01}
+  ],
+  "gripper": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+  "model_type": "Simulation",
+  "metadata": {"device": "cuda", "num_timesteps": 12}
+}
inference_results/open_drawer_gripper.png ADDED
inference_results/open_drawer_third.png ADDED
inference_results/pick_red_block.json ADDED
@@ -0,0 +1,148 @@
+{
+  "id": "pick_red_block",
+  "instruction": "Pick up the red block",
+  "description": "Grasp the red block from the table",
+  "third_view_image": "pick_red_block_third.png",
+  "gripper_view_image": "pick_red_block_gripper.png",
+  "actions": [
+    {"timestep": 0, "delta_x": 0.0, "delta_y": 0.0, "delta_z": -0.03, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 1, "delta_x": 0.004166666666666667, "delta_y": 0.0016666666666666666, "delta_z": -0.027499999999999997, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 2, "delta_x": 0.008333333333333333, "delta_y": 0.003333333333333333, "delta_z": -0.025, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 3, "delta_x": 0.0125, "delta_y": 0.005, "delta_z": -0.0225, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 4, "delta_x": 0.016666666666666666, "delta_y": 0.006666666666666666, "delta_z": -0.02, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 5, "delta_x": 0.020833333333333336, "delta_y": 0.008333333333333333, "delta_z": -0.017499999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 6, "delta_x": 0.025, "delta_y": 0.01, "delta_z": -0.015, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 7, "delta_x": 0.02916666666666667, "delta_y": 0.011666666666666667, "delta_z": -0.012499999999999999, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 8, "delta_x": 0.03333333333333333, "delta_y": 0.013333333333333332, "delta_z": -0.01, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 9, "delta_x": 0.037500000000000006, "delta_y": 0.015, "delta_z": -0.0075, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 10, "delta_x": 0.04166666666666667, "delta_y": 0.016666666666666666, "delta_z": -0.004999999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 11, "delta_x": 0.04583333333333334, "delta_y": 0.018333333333333333, "delta_z": -0.002500000000000001, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01}
+  ],
+  "gripper": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+  "model_type": "Simulation",
+  "metadata": {"device": "cuda", "num_timesteps": 12}
+}
inference_results/pick_red_block_gripper.png ADDED
inference_results/pick_red_block_third.png ADDED
inference_results/place_in_drawer.json ADDED
@@ -0,0 +1,148 @@
+{
+  "id": "place_in_drawer",
+  "instruction": "Place the red block in the drawer",
+  "description": "Put the grasped object into the open drawer",
+  "third_view_image": "place_in_drawer_third.png",
+  "gripper_view_image": "place_in_drawer_gripper.png",
+  "actions": [
+    {"timestep": 0, "delta_x": 0.0, "delta_y": 0.0, "delta_z": -0.03, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 1, "delta_x": 0.004166666666666667, "delta_y": 0.0016666666666666666, "delta_z": -0.027499999999999997, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 2, "delta_x": 0.008333333333333333, "delta_y": 0.003333333333333333, "delta_z": -0.025, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 3, "delta_x": 0.0125, "delta_y": 0.005, "delta_z": -0.0225, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 4, "delta_x": 0.016666666666666666, "delta_y": 0.006666666666666666, "delta_z": -0.02, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 5, "delta_x": 0.020833333333333336, "delta_y": 0.008333333333333333, "delta_z": -0.017499999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 6, "delta_x": 0.025, "delta_y": 0.01, "delta_z": -0.015, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 7, "delta_x": 0.02916666666666667, "delta_y": 0.011666666666666667, "delta_z": -0.012499999999999999, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 8, "delta_x": 0.03333333333333333, "delta_y": 0.013333333333333332, "delta_z": -0.01, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 9, "delta_x": 0.037500000000000006, "delta_y": 0.015, "delta_z": -0.0075, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 10, "delta_x": 0.04166666666666667, "delta_y": 0.016666666666666666, "delta_z": -0.004999999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 11, "delta_x": 0.04583333333333334, "delta_y": 0.018333333333333333, "delta_z": -0.002500000000000001, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01}
+  ],
+  "gripper": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+  "model_type": "Simulation",
+  "metadata": {"device": "cuda", "num_timesteps": 12}
+}
inference_results/place_in_drawer_gripper.png ADDED
inference_results/place_in_drawer_third.png ADDED
inference_results/slide_block.json ADDED
@@ -0,0 +1,148 @@
+{
+  "id": "slide_block",
+  "instruction": "Slide the block to the left",
+  "description": "Push the block horizontally across the table",
+  "third_view_image": "slide_block_third.png",
+  "gripper_view_image": "slide_block_gripper.png",
+  "actions": [
+    {"timestep": 0, "delta_x": 0.0, "delta_y": 0.0, "delta_z": -0.03, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 1, "delta_x": 0.004166666666666667, "delta_y": 0.0016666666666666666, "delta_z": -0.027499999999999997, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 2, "delta_x": 0.008333333333333333, "delta_y": 0.003333333333333333, "delta_z": -0.025, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 3, "delta_x": 0.0125, "delta_y": 0.005, "delta_z": -0.0225, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 4, "delta_x": 0.016666666666666666, "delta_y": 0.006666666666666666, "delta_z": -0.02, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 5, "delta_x": 0.020833333333333336, "delta_y": 0.008333333333333333, "delta_z": -0.017499999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 6, "delta_x": 0.025, "delta_y": 0.01, "delta_z": -0.015, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 7, "delta_x": 0.02916666666666667, "delta_y": 0.011666666666666667, "delta_z": -0.012499999999999999, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 8, "delta_x": 0.03333333333333333, "delta_y": 0.013333333333333332, "delta_z": -0.01, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 9, "delta_x": 0.037500000000000006, "delta_y": 0.015, "delta_z": -0.0075, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 10, "delta_x": 0.04166666666666667, "delta_y": 0.016666666666666666, "delta_z": -0.004999999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 11, "delta_x": 0.04583333333333334, "delta_y": 0.018333333333333333, "delta_z": -0.002500000000000001, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01}
+  ],
+  "gripper": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+  "model_type": "Simulation",
+  "metadata": {"device": "cuda", "num_timesteps": 12}
+}
inference_results/slide_block_gripper.png ADDED
inference_results/slide_block_third.png ADDED
requirements.txt CHANGED
@@ -1,9 +1,4 @@
 gradio==4.8.0
-torch==2.1.0
-torchvision==0.16.0
-transformers==4.35.0
-huggingface_hub>=0.16.4,<1.0
-einops==0.7.0
 numpy==1.24.3
 Pillow==10.1.0
-matplotlib==3.8.0
+matplotlib==3.8.0