An open-source implementation of LongVA, intended to support the large multimodal model community.
{
"id": id of the video/image data,
"video": path to the video/image data,
"conversations": [
{
"from": "human",
"value": "\n Instruction"
},
{
"from": "gpt",
"value": "Output"
},
{
"from": "human",
"value": "\n Instruction"
},
{
"from": "gpt",
"value": "Output"
}
]
},