aw1app committed
Commit 12158f3 · 1 Parent(s): 98bbe36

Add pre-computed results viewer
README.md CHANGED
@@ -7,43 +7,10 @@ sdk: gradio
 sdk_version: 4.8.0
 app_file: app.py
 pinned: false
-license: mit
 ---
 
-# RoboFlamingo Demo 🤖
-
-Interactive demo for **RoboFlamingo: Vision-Language Foundation Models as Effective Robot Imitators**
-
-## About
-
-RoboFlamingo adapts pre-trained Vision-Language Models for robot manipulation through efficient fine-tuning.
-
-### Key Features
-- 🎯 SOTA performance on CALVIN (4.09 avg task length)
-- 💪 Frozen OpenFlamingo + trainable policy head
-- ⚡ Single GPU training
-- 🎨 Natural language robot control
-
-## Architecture
-
-```
-Images → ViT → Perceiver → Flamingo → Policy → Actions
-        (frozen)  (trainable)  (partial)  (trainable)
-```
-
-## Citation
-
-```bibtex
-@article{li2023vision,
-  title={Vision-Language Foundation Models as Effective Robot Imitators},
-  author={Li, Xinghang and Liu, Minghuan and Zhang, Hanbo and Yu, Cunjun and Xu, Jie and Wu, Hongtao and Cheang, Chilam and Jing, Ya and Zhang, Weinan and Liu, Huaping and Li, Hang and Kong, Tao},
-  journal={arXiv preprint arXiv:2311.01378},
-  year={2023}
-}
-```
-
-## Resources
-
-- 📄 [Paper](https://arxiv.org/abs/2311.01378)
-- 💻 [GitHub](https://github.com/RoboFlamingo/RoboFlamingo)
-- 🤗 [Models](https://huggingface.co/roboflamingo)
+# RoboFlamingo Demo
+
+Pre-computed inference results from the RoboFlamingo model.
+
+[Paper](https://arxiv.org/abs/2311.01378) | [Code](https://github.com/RoboFlamingo/RoboFlamingo)
app.py CHANGED
@@ -1,224 +1,85 @@
 import gradio as gr
-import torch
+import json
 import numpy as np
 from PIL import Image
 import matplotlib.pyplot as plt
 from io import BytesIO
+from pathlib import Path
 
-# ============================================================================
-# MODEL (Demo Version)
-# ============================================================================
-
-class RoboFlamingoDemo:
-    """Simplified RoboFlamingo demo for HuggingFace Spaces."""
-
-    def __init__(self):
-        print("🤖 Initializing RoboFlamingo Demo...")
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        print(f"📍 Using device: {self.device}")
-
-    def predict_actions(self, instruction, third_view_img, gripper_view_img):
-        """Predict robot actions given instruction and images."""
-
-        window_size = 12  # RoboFlamingo uses 12 timesteps
-
-        # Simulate 7D pose predictions
-        actions = []
-        for t in range(window_size):
-            progress = t / window_size
-            action = {
-                'timestep': t,
-                'delta_x': 0.05 * progress,
-                'delta_y': 0.02 * progress,
-                'delta_z': -0.03 * (1 - progress),
-                'qw': 0.99,
-                'qx': 0.01,
-                'qy': 0.01,
-                'qz': 0.01,
-            }
-            actions.append(action)
-
-        # Simulate gripper predictions
-        gripper_commands = [1 if t > window_size // 2 else 0 for t in range(window_size)]
-
-        return {'actions': actions, 'gripper': gripper_commands, 'instruction': instruction}
-
-# Initialize model
-demo_model = RoboFlamingoDemo()
-
-# ============================================================================
-# VISUALIZATION
-# ============================================================================
-
-def create_trajectory_plot(actions):
-    """Create 3D trajectory visualization."""
-    fig = plt.figure(figsize=(10, 8))
+RESULTS_DIR = Path("inference_results")
+
+def load_results():
+    results = {}
+    for f in RESULTS_DIR.glob("*.json"):
+        try:
+            data = json.load(open(f))
+            if 'id' in data and data['id'] != 'index':
+                results[data['id']] = data
+        except: pass
+    return results
+
+RESULTS = load_results()
+print(f"Loaded {len(RESULTS)} examples")
+
+def plot_traj(acts):
+    fig = plt.figure(figsize=(10, 8))
     ax = fig.add_subplot(111, projection='3d')
-
-    x_positions = [a['delta_x'] for a in actions]
-    y_positions = [a['delta_y'] for a in actions]
-    z_positions = [a['delta_z'] for a in actions]
-
-    x_cum = np.cumsum(x_positions)
-    y_cum = np.cumsum(y_positions)
-    z_cum = np.cumsum(z_positions)
-
-    ax.plot(x_cum, y_cum, z_cum, 'b-', linewidth=2, marker='o', markersize=6)
-    ax.scatter([x_cum[0]], [y_cum[0]], [z_cum[0]], c='green', s=100, marker='o', label='Start')
-    ax.scatter([x_cum[-1]], [y_cum[-1]], [z_cum[-1]], c='red', s=100, marker='*', label='End')
-
-    ax.set_xlabel('X Position (m)')
-    ax.set_ylabel('Y Position (m)')
-    ax.set_zlabel('Z Position (m)')
-    ax.set_title('Predicted End-Effector Trajectory')
-    ax.legend()
-    ax.grid(True)
-
+    x = np.cumsum([a['delta_x'] for a in acts])
+    y = np.cumsum([a['delta_y'] for a in acts])
+    z = np.cumsum([a['delta_z'] for a in acts])
+    ax.plot(x, y, z, 'b-', lw=2, marker='o', ms=6)
+    ax.scatter(x[0], y[0], z[0], c='green', s=100, label='Start')
+    ax.scatter(x[-1], y[-1], z[-1], c='red', s=100, label='End')
+    ax.set_xlabel('X'); ax.set_ylabel('Y'); ax.set_zlabel('Z')
+    ax.legend(); ax.grid()
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
-    buf.seek(0)
-    plt.close()
-
+    buf.seek(0); plt.close()
    return Image.open(buf)
 
-def create_gripper_timeline(gripper_commands):
-    """Create gripper state timeline."""
-    fig, ax = plt.subplots(figsize=(12, 3))
-
-    timesteps = list(range(len(gripper_commands)))
-    colors = ['green' if cmd == 0 else 'red' for cmd in gripper_commands]
-    labels = ['OPEN' if cmd == 0 else 'CLOSE' for cmd in gripper_commands]
-
-    ax.bar(timesteps, [1]*len(timesteps), color=colors, alpha=0.7, edgecolor='black')
-
-    for i, (t, label) in enumerate(zip(timesteps, labels)):
-        ax.text(t, 0.5, label, ha='center', va='center', fontweight='bold', fontsize=8)
-
-    ax.set_xlabel('Timestep')
-    ax.set_ylabel('Gripper State')
-    ax.set_title('Predicted Gripper Commands (Green=Open, Red=Close)')
-    ax.set_ylim(0, 1.2)
-    ax.set_xticks(timesteps)
-    ax.grid(True, alpha=0.3)
-
+def plot_grip(grip):
+    fig, ax = plt.subplots(figsize=(12, 3))
+    cols = ['green' if g==0 else 'red' for g in grip]
+    ax.bar(range(len(grip)), [1]*len(grip), color=cols, alpha=0.7, ec='black')
+    for i, g in enumerate(grip):
+        ax.text(i, 0.5, 'OPEN' if g==0 else 'CLOSE', ha='center', va='center', weight='bold')
+    ax.set_xlabel('Timestep'); ax.set_ylim(0, 1.2); ax.grid(alpha=0.3)
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
-    buf.seek(0)
-    plt.close()
-
+    buf.seek(0); plt.close()
    return Image.open(buf)
 
-def format_actions_table(actions):
-    """Format actions as markdown table."""
-    table = "| Timestep | Δx (m) | Δy (m) | Δz (m) | Quaternion (w,x,y,z) |\n"
-    table += "|----------|--------|--------|--------|----------------------|\n"
-
-    for action in actions:
-        table += f"| {action['timestep']:2d} | "
-        table += f"{action['delta_x']:6.3f} | "
-        table += f"{action['delta_y']:6.3f} | "
-        table += f"{action['delta_z']:6.3f} | "
-        table += f"({action['qw']:.2f}, {action['qx']:.2f}, {action['qy']:.2f}, {action['qz']:.2f}) |\n"
-
-    return table
-
-# ============================================================================
-# MAIN PREDICTION FUNCTION
-# ============================================================================
-
-def predict_robot_actions(instruction, third_view_image, gripper_view_image):
-    """Main prediction function."""
-
-    if not instruction or instruction.strip() == "":
-        return None, None, "", "❌ Please enter an instruction!"
-
-    if third_view_image is None:
-        return None, None, "", "❌ Please upload a third-person view image!"
-
-    if gripper_view_image is None:
-        return None, None, "", "❌ Please upload a gripper view image!"
-
-    try:
-        if isinstance(third_view_image, np.ndarray):
-            third_view_image = Image.fromarray(third_view_image)
-        if isinstance(gripper_view_image, np.ndarray):
-            gripper_view_image = Image.fromarray(gripper_view_image)
-
-        results = demo_model.predict_actions(instruction, third_view_image, gripper_view_image)
-
-        trajectory_plot = create_trajectory_plot(results['actions'])
-        gripper_plot = create_gripper_timeline(results['gripper'])
-        actions_table = format_actions_table(results['actions'])
-
-        status = f"✅ Successfully predicted actions for: '{instruction}'"
-
-        return trajectory_plot, gripper_plot, actions_table, status
-
-    except Exception as e:
-        return None, None, "", f"❌ Error: {str(e)}"
-
-# ============================================================================
-# GRADIO INTERFACE
-# ============================================================================
+def show(eid):
+    if not RESULTS or eid not in RESULTS:
+        return [None]*6
+    r = RESULTS[eid]
+    img1 = Image.open(RESULTS_DIR / r['third_view_image'])
+    img2 = Image.open(RESULTS_DIR / r['gripper_view_image'])
+    traj = plot_traj(r['actions'])
+    grip = plot_grip(r['gripper'])
+    table = "| T | Δx | Δy | Δz |\n|--|--|--|--|\n"
+    for a in r['actions']:
+        table += f"| {a['timestep']} | {a['delta_x']:.3f} | {a['delta_y']:.3f} | {a['delta_z']:.3f} |\n"
+    status = f"✅ {r['instruction']}\nModel: {r['model_type']}"
+    return img1, img2, traj, grip, table, status
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-
-    gr.Markdown("""
-    # 🤖 RoboFlamingo Demo
-
-    ### Vision-Language Foundation Models as Effective Robot Imitators
-
-    **How to use:**
-    1. Upload third-person camera view
-    2. Upload gripper camera view
-    3. Enter instruction (e.g., "Pick up the red block")
-    4. Click "Predict Actions"
-
-    ---
-    """)
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Markdown("### 📥 Inputs")
-
-            instruction_input = gr.Textbox(
-                label="Language Instruction",
-                placeholder="e.g., Pick up the red block and put it in the drawer",
-                lines=3
-            )
-
-            with gr.Row():
-                third_view_input = gr.Image(label="Third-Person View", type="pil", height=250)
-                gripper_view_input = gr.Image(label="Gripper View", type="pil", height=250)
-
-            predict_button = gr.Button("🚀 Predict Actions", variant="primary", size="lg")
-            status_output = gr.Textbox(label="Status", interactive=False, lines=2)
-
-        with gr.Column(scale=1):
-            gr.Markdown("### 📊 Predictions")
-            trajectory_output = gr.Image(label="Predicted Trajectory (3D)", type="pil")
-            gripper_output = gr.Image(label="Gripper Commands", type="pil")
-
+    gr.Markdown("# 🤖 RoboFlamingo Demo\nPre-computed results viewer")
+    sel = gr.Dropdown(list(RESULTS.keys()) if RESULTS else ["None"],
+                      label="Example", value=list(RESULTS.keys())[0] if RESULTS else None)
     with gr.Row():
-        actions_table_output = gr.Markdown(label="Detailed Actions")
-
-    predict_button.click(
-        fn=predict_robot_actions,
-        inputs=[instruction_input, third_view_input, gripper_view_input],
-        outputs=[trajectory_output, gripper_output, actions_table_output, status_output]
-    )
-
-    gr.Markdown("""
-    ---
-
-    ### 📚 About RoboFlamingo
-
-    - **Paper:** [arXiv:2311.01378](https://arxiv.org/abs/2311.01378)
-    - **Code:** [GitHub](https://github.com/RoboFlamingo/RoboFlamingo)
-    - **Models:** [HuggingFace](https://huggingface.co/roboflamingo)
-
-    **Note:** Demo version with simulated predictions. For full model, see GitHub.
-    """)
+        with gr.Column():
+            gr.Markdown("### Inputs")
+            i1 = gr.Image(label="Third View", type="pil")
+            i2 = gr.Image(label="Gripper", type="pil")
+            st = gr.Markdown()
+        with gr.Column():
+            gr.Markdown("### Predictions")
+            o1 = gr.Image(label="Trajectory", type="pil")
+            o2 = gr.Image(label="Gripper", type="pil")
+            tab = gr.Markdown()
+    sel.change(show, [sel], [i1, i2, o1, o2, tab, st])
+    demo.load(show, [sel], [i1, i2, o1, o2, tab, st])
+    gr.Markdown("[Paper](https://arxiv.org/abs/2311.01378) | [Code](https://github.com/RoboFlamingo/RoboFlamingo)")
 
-if __name__ == "__main__":
-    demo.launch()
+demo.launch()
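
For reference, the data contract between `load_results()`/`show()` and the files added below can be sketched as follows: each `inference_results/<id>.json` needs an `id` (anything other than `"index"`), an `instruction`, two image filenames resolved relative to `RESULTS_DIR`, an `actions` list of per-timestep dicts with `delta_x`/`delta_y`/`delta_z` plus quaternion fields, a `gripper` list, and a `model_type`. A minimal generator sketch — the `save_example` helper and its demo values are illustrative assumptions, not part of this commit:

```python
import json
from pathlib import Path

RESULTS_DIR = Path("inference_results")
RESULTS_DIR.mkdir(exist_ok=True)

def save_example(eid, instruction, actions, gripper):
    """Hypothetical helper: write one result file in the shape show() reads."""
    record = {
        "id": eid,                               # load_results() drops entries with id == 'index'
        "instruction": instruction,
        "description": instruction,              # present in the committed files, unused by the viewer
        "third_view_image": f"{eid}_third.png",  # opened as Image.open(RESULTS_DIR / ...)
        "gripper_view_image": f"{eid}_gripper.png",
        "actions": actions,                      # per-timestep deltas + quaternion (qw, qx, qy, qz)
        "gripper": gripper,                      # 0 = open, 1 = close, one entry per timestep
        "model_type": "Simulation",
    }
    (RESULTS_DIR / f"{eid}.json").write_text(json.dumps(record, indent=2))

# The same linear-ramp pattern as the committed files: delta_x = 0.05*t/12, etc.
acts = [{"timestep": t, "delta_x": 0.05 * t / 12, "delta_y": 0.02 * t / 12,
         "delta_z": -0.03 * (1 - t / 12), "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01}
        for t in range(12)]
save_example("demo_task", "Pick up the red block", acts, [0] * 6 + [1] * 6)
```

Since `show()` also calls `Image.open` on the two PNG names, the matching `*_third.png` and `*_gripper.png` files must sit next to the JSON.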
 
inference_results/close_drawer.json ADDED
@@ -0,0 +1,148 @@
+{
+  "id": "close_drawer",
+  "instruction": "Close the drawer",
+  "description": "Push the drawer to close it",
+  "third_view_image": "close_drawer_third.png",
+  "gripper_view_image": "close_drawer_gripper.png",
+  "actions": [
+    {"timestep": 0, "delta_x": 0.0, "delta_y": 0.0, "delta_z": -0.03, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 1, "delta_x": 0.004166666666666667, "delta_y": 0.0016666666666666666, "delta_z": -0.027499999999999997, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 2, "delta_x": 0.008333333333333333, "delta_y": 0.003333333333333333, "delta_z": -0.025, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 3, "delta_x": 0.0125, "delta_y": 0.005, "delta_z": -0.0225, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 4, "delta_x": 0.016666666666666666, "delta_y": 0.006666666666666666, "delta_z": -0.02, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 5, "delta_x": 0.020833333333333336, "delta_y": 0.008333333333333333, "delta_z": -0.017499999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 6, "delta_x": 0.025, "delta_y": 0.01, "delta_z": -0.015, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 7, "delta_x": 0.02916666666666667, "delta_y": 0.011666666666666667, "delta_z": -0.012499999999999999, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 8, "delta_x": 0.03333333333333333, "delta_y": 0.013333333333333332, "delta_z": -0.01, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 9, "delta_x": 0.037500000000000006, "delta_y": 0.015, "delta_z": -0.0075, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 10, "delta_x": 0.04166666666666667, "delta_y": 0.016666666666666666, "delta_z": -0.004999999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 11, "delta_x": 0.04583333333333334, "delta_y": 0.018333333333333333, "delta_z": -0.002500000000000001, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01}
+  ],
+  "gripper": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+  "model_type": "Simulation",
+  "metadata": {"device": "cuda", "num_timesteps": 12}
+}
inference_results/close_drawer_gripper.png ADDED
inference_results/close_drawer_third.png ADDED
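
All five result files in this commit share this exact simulated trajectory; only `id`, `instruction`, `description`, and the image filenames differ. As a quick sanity check on the numbers, `plot_traj` integrates the deltas with `np.cumsum`, so the plotted end point follows directly from the linear ramps (a throwaway verification snippet, not part of the commit):

```python
import numpy as np

# Deltas as stored in the JSON: delta_x = 0.05*t/12, delta_y = 0.02*t/12,
# delta_z = -0.03*(1 - t/12), for t = 0..11.
dx = [0.05 * t / 12 for t in range(12)]
dy = [0.02 * t / 12 for t in range(12)]
dz = [-0.03 * (1 - t / 12) for t in range(12)]

# Cumulative end point, as plot_traj computes it: ≈ (0.275, 0.110, -0.195)
print(np.cumsum(dx)[-1], np.cumsum(dy)[-1], np.cumsum(dz)[-1])
```
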
inference_results/index.json ADDED
@@ -0,0 +1,12 @@
+{
+  "total_examples": 5,
+  "examples": [
+    "pick_red_block",
+    "open_drawer",
+    "place_in_drawer",
+    "close_drawer",
+    "slide_block"
+  ],
+  "model_type": "Simulation",
+  "generated_at": "1763025901.712671"
+}
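
Two quirks in this index are worth flagging: `load_results()` parses it but then drops it, since it has no `id` key (so the `data['id'] != 'index'` guard never actually fires), and `generated_at` is a stringified Unix timestamp rather than an ISO date, so any consumer has to convert it, e.g. (illustrative only):

```python
from datetime import datetime, timezone

ts = "1763025901.712671"  # generated_at from index.json
print(datetime.fromtimestamp(float(ts), tz=timezone.utc).isoformat())
```
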
inference_results/open_drawer.json ADDED
@@ -0,0 +1,148 @@
+{
+  "id": "open_drawer",
+  "instruction": "Open the drawer",
+  "description": "Pull the drawer handle to open",
+  "third_view_image": "open_drawer_third.png",
+  "gripper_view_image": "open_drawer_gripper.png",
+  "actions": [
+    {"timestep": 0, "delta_x": 0.0, "delta_y": 0.0, "delta_z": -0.03, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 1, "delta_x": 0.004166666666666667, "delta_y": 0.0016666666666666666, "delta_z": -0.027499999999999997, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 2, "delta_x": 0.008333333333333333, "delta_y": 0.003333333333333333, "delta_z": -0.025, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 3, "delta_x": 0.0125, "delta_y": 0.005, "delta_z": -0.0225, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 4, "delta_x": 0.016666666666666666, "delta_y": 0.006666666666666666, "delta_z": -0.02, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 5, "delta_x": 0.020833333333333336, "delta_y": 0.008333333333333333, "delta_z": -0.017499999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 6, "delta_x": 0.025, "delta_y": 0.01, "delta_z": -0.015, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 7, "delta_x": 0.02916666666666667, "delta_y": 0.011666666666666667, "delta_z": -0.012499999999999999, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 8, "delta_x": 0.03333333333333333, "delta_y": 0.013333333333333332, "delta_z": -0.01, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 9, "delta_x": 0.037500000000000006, "delta_y": 0.015, "delta_z": -0.0075, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 10, "delta_x": 0.04166666666666667, "delta_y": 0.016666666666666666, "delta_z": -0.004999999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 11, "delta_x": 0.04583333333333334, "delta_y": 0.018333333333333333, "delta_z": -0.002500000000000001, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01}
+  ],
+  "gripper": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+  "model_type": "Simulation",
+  "metadata": {"device": "cuda", "num_timesteps": 12}
+}
inference_results/open_drawer_gripper.png ADDED
inference_results/open_drawer_third.png ADDED
inference_results/pick_red_block.json ADDED
@@ -0,0 +1,148 @@
+{
+  "id": "pick_red_block",
+  "instruction": "Pick up the red block",
+  "description": "Grasp the red block from the table",
+  "third_view_image": "pick_red_block_third.png",
+  "gripper_view_image": "pick_red_block_gripper.png",
+  "actions": [
+    {"timestep": 0, "delta_x": 0.0, "delta_y": 0.0, "delta_z": -0.03, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 1, "delta_x": 0.004166666666666667, "delta_y": 0.0016666666666666666, "delta_z": -0.027499999999999997, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 2, "delta_x": 0.008333333333333333, "delta_y": 0.003333333333333333, "delta_z": -0.025, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 3, "delta_x": 0.0125, "delta_y": 0.005, "delta_z": -0.0225, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 4, "delta_x": 0.016666666666666666, "delta_y": 0.006666666666666666, "delta_z": -0.02, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 5, "delta_x": 0.020833333333333336, "delta_y": 0.008333333333333333, "delta_z": -0.017499999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 6, "delta_x": 0.025, "delta_y": 0.01, "delta_z": -0.015, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 7, "delta_x": 0.02916666666666667, "delta_y": 0.011666666666666667, "delta_z": -0.012499999999999999, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 8, "delta_x": 0.03333333333333333, "delta_y": 0.013333333333333332, "delta_z": -0.01, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 9, "delta_x": 0.037500000000000006, "delta_y": 0.015, "delta_z": -0.0075, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 10, "delta_x": 0.04166666666666667, "delta_y": 0.016666666666666666, "delta_z": -0.004999999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 11, "delta_x": 0.04583333333333334, "delta_y": 0.018333333333333333, "delta_z": -0.002500000000000001, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01}
+  ],
+  "gripper": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+  "model_type": "Simulation",
+  "metadata": {"device": "cuda", "num_timesteps": 12}
+}
inference_results/pick_red_block_gripper.png ADDED
inference_results/pick_red_block_third.png ADDED
inference_results/place_in_drawer.json ADDED
@@ -0,0 +1,148 @@
+{
+  "id": "place_in_drawer",
+  "instruction": "Place the red block in the drawer",
+  "description": "Put the grasped object into the open drawer",
+  "third_view_image": "place_in_drawer_third.png",
+  "gripper_view_image": "place_in_drawer_gripper.png",
+  "actions": [
+    {"timestep": 0, "delta_x": 0.0, "delta_y": 0.0, "delta_z": -0.03, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 1, "delta_x": 0.004166666666666667, "delta_y": 0.0016666666666666666, "delta_z": -0.027499999999999997, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 2, "delta_x": 0.008333333333333333, "delta_y": 0.003333333333333333, "delta_z": -0.025, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 3, "delta_x": 0.0125, "delta_y": 0.005, "delta_z": -0.0225, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 4, "delta_x": 0.016666666666666666, "delta_y": 0.006666666666666666, "delta_z": -0.02, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 5, "delta_x": 0.020833333333333336, "delta_y": 0.008333333333333333, "delta_z": -0.017499999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 6, "delta_x": 0.025, "delta_y": 0.01, "delta_z": -0.015, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 7, "delta_x": 0.02916666666666667, "delta_y": 0.011666666666666667, "delta_z": -0.012499999999999999, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 8, "delta_x": 0.03333333333333333, "delta_y": 0.013333333333333332, "delta_z": -0.01, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 9, "delta_x": 0.037500000000000006, "delta_y": 0.015, "delta_z": -0.0075, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 10, "delta_x": 0.04166666666666667, "delta_y": 0.016666666666666666, "delta_z": -0.004999999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 11, "delta_x": 0.04583333333333334, "delta_y": 0.018333333333333333, "delta_z": -0.002500000000000001, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01}
+  ],
+  "gripper": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+  "model_type": "Simulation",
+  "metadata": {"device": "cuda", "num_timesteps": 12}
+}
inference_results/place_in_drawer_gripper.png ADDED
inference_results/place_in_drawer_third.png ADDED
inference_results/slide_block.json ADDED
@@ -0,0 +1,148 @@
+{
+  "id": "slide_block",
+  "instruction": "Slide the block to the left",
+  "description": "Push the block horizontally across the table",
+  "third_view_image": "slide_block_third.png",
+  "gripper_view_image": "slide_block_gripper.png",
+  "actions": [
+    {"timestep": 0, "delta_x": 0.0, "delta_y": 0.0, "delta_z": -0.03, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 1, "delta_x": 0.004166666666666667, "delta_y": 0.0016666666666666666, "delta_z": -0.027499999999999997, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 2, "delta_x": 0.008333333333333333, "delta_y": 0.003333333333333333, "delta_z": -0.025, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 3, "delta_x": 0.0125, "delta_y": 0.005, "delta_z": -0.0225, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 4, "delta_x": 0.016666666666666666, "delta_y": 0.006666666666666666, "delta_z": -0.02, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 5, "delta_x": 0.020833333333333336, "delta_y": 0.008333333333333333, "delta_z": -0.017499999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 6, "delta_x": 0.025, "delta_y": 0.01, "delta_z": -0.015, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 7, "delta_x": 0.02916666666666667, "delta_y": 0.011666666666666667, "delta_z": -0.012499999999999999, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 8, "delta_x": 0.03333333333333333, "delta_y": 0.013333333333333332, "delta_z": -0.01, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 9, "delta_x": 0.037500000000000006, "delta_y": 0.015, "delta_z": -0.0075, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 10, "delta_x": 0.04166666666666667, "delta_y": 0.016666666666666666, "delta_z": -0.004999999999999998, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01},
+    {"timestep": 11, "delta_x": 0.04583333333333334, "delta_y": 0.018333333333333333, "delta_z": -0.002500000000000001, "qw": 0.99, "qx": 0.01, "qy": 0.01, "qz": 0.01}
+  ],
+  "gripper": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+  "model_type": "Simulation",
+  "metadata": {"device": "cuda", "num_timesteps": 12}
+}
inference_results/slide_block_gripper.png ADDED
inference_results/slide_block_third.png ADDED
requirements.txt CHANGED
@@ -1,9 +1,4 @@
 gradio==4.8.0
-torch==2.1.0
-torchvision==0.16.0
-transformers==4.35.0
-huggingface_hub>=0.16.4,<1.0
-einops==0.7.0
 numpy==1.24.3
 Pillow==10.1.0
-matplotlib==3.8.0
+matplotlib==3.8.0