{"payload":{"pageCount":2,"repositories":[{"type":"Public","name":"InternVL","owner":"OpenGVLab","isFork":false,"description":"[CVPR 2024 Oral] InternVL Family: A Pioneering Open-Source Alternative to GPT-4V. 接近GPT-4V表现的可商用开源多模态对话模型","allTopics":["image-classification","gpt","multi-modal","semantic-segmentation","video-classification","mme","image-text-retrieval","llm","vision-language-model","gpt-4v","vit-6b","vit-22b","gpt-4o"],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":94,"starsCount":2931,"forksCount":217,"license":"MIT License","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,6,1,5,4,7,6,2,1,9,6,2,0,1,1,0,1,3,17,2,8,14,5],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-05-29T09:26:35.474Z"}},{"type":"Public","name":"Ask-Anything","owner":"OpenGVLab","isFork":false,"description":"[CVPR2024 Highlight][VideoChatGPT] ChatGPT with video understanding! And many more supported LMs such as miniGPT4, StableLM, and MOSS.","allTopics":["chat","video","gradio","big-model","video-understanding","captioning-videos","video-question-answering","foundation-models","large-model","large-language-models","chatgpt","langchain","stablelm"],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":3,"issueCount":72,"starsCount":2758,"forksCount":223,"license":"MIT License","participation":[1,2,2,1,0,0,4,1,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,3,6,0,0,1,0,1,2,0,0,0,3,1,1,1,1,0,7,0,0,0,0,0,3,10],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-05-28T15:06:47.295Z"}},{"type":"Public","name":"unmasked_teacher","owner":"OpenGVLab","isFork":false,"description":"[ICCV2023 Oral] Unmasked Teacher: Towards Training-Efficient Video Foundation Models","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":5,"starsCount":252,"forksCount":12,"license":"MIT License","participation":[0,0,0,0,0,0,0,3,0,0,0,2,2,0,0,1,3,0,1,0,2,4,3,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-05-27T05:22:56.986Z"}},{"type":"Public","name":"InternVideo","owner":"OpenGVLab","isFork":false,"description":"Video Foundation Models & Data for Multimodal Understanding","allTopics":["benchmark","action-recognition","video-understanding","video-data","self-supervised","multimodal","video-dataset","open-set-recognition","video-retrieval","video-question-answering","masked-autoencoder","temporal-action-localization","contrastive-learning","spatio-temporal-action-localization","zero-shot-retrieval","video-clip","vision-transformer","zero-shot-classification","foundation-models","instruction-tuning"],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":3,"issueCount":50,"starsCount":1018,"forksCount":69,"license":"Apache License 2.0","participation":[0,0,0,0,1,0,8,3,0,0,0,0,0,0,4,0,0,0,0,0,2,11,0,0,0,3,2,0,0,0,0,0,0,1,7,1,0,0,0,0,0,0,0,10,1,8,3,8,0,0,4,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-05-26T15:16:30.863Z"}},{"type":"Public","name":"MMT-Bench","owner":"OpenGVLab","isFork":false,"description":"ICML'2024 | MMT-Bench: A Comprehensive Multimodal Benchmark for Evaluating Large Vision-Language Models Towards Multitask AGI","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":3,"starsCount":39,"forksCount":0,"license":null,"participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,5,0,0,1,1],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-05-23T11:01:18.403Z"}},{"type":"Public","name":"ChartAst","owner":"OpenGVLab","isFork":false,"description":" ChartAssistant is a chart-based vision-language model for universal chart comprehension and reasoning.","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":8,"starsCount":60,"forksCount":4,"license":"Other","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,18,19,3,1,0,0,3,7,0,0,1,0,0,0,0,0,0,0,0,7,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-05-21T11:09:53.410Z"}},{"type":"Public","name":".github","owner":"OpenGVLab","isFork":false,"description":"","allTopics":[],"primaryLanguage":null,"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":0,"license":null,"participation":[0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,2,0,0,11,0,0,0,2,0,1,1,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-05-21T08:51:18.135Z"}},{"type":"Public","name":"DCNv4","owner":"OpenGVLab","isFork":false,"description":"[CVPR 2024] Deformable Convolution v4","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":36,"starsCount":359,"forksCount":23,"license":"MIT License","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,3,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-05-17T04:00:21.269Z"}},{"type":"Public","name":"video-mamba-suite","owner":"OpenGVLab","isFork":false,"description":"The suite of modeling video with Mamba","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":1,"issueCount":7,"starsCount":160,"forksCount":15,"license":"MIT License","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,17,9,17,0,0,0,0,1,2,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-05-14T03:23:44.558Z"}},{"type":"Public","name":"STM-Evaluation","owner":"OpenGVLab","isFork":false,"description":"","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":1,"starsCount":69,"forksCount":6,"license":"MIT License","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-05-09T05:41:04.120Z"}},{"type":"Public","name":"InternImage","owner":"OpenGVLab","isFork":false,"description":"[CVPR 2023 Highlight] InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions","allTopics":["backbone","semantic-segmentation","deformable-convolution","foundation-model","object-detection"],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":5,"issueCount":169,"starsCount":2356,"forksCount":227,"license":"MIT License","participation":[1,0,0,0,0,1,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-29T22:23:01.417Z"}},{"type":"Public","name":"PonderV2","owner":"OpenGVLab","isFork":false,"description":"PonderV2: Pave the Way for 3D Foundation Model with A Universal Pre-training Paradigm","allTopics":["3d-vision","pretraining","foundation-models"],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":4,"starsCount":300,"forksCount":5,"license":"MIT License","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,3,0,0,0,0,0,0,0,1,0,0,0,0,1,3,5,3,1,1,6,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-25T11:38:08.991Z"}},{"type":"Public","name":"EgoExoLearn","owner":"OpenGVLab","isFork":false,"description":"Data and benchmark code for the EgoExoLearn dataset","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":1,"starsCount":27,"forksCount":0,"license":"MIT License","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,6,0,0,2,1,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-24T03:31:34.096Z"}},{"type":"Public","name":"Multi-Modality-Arena","owner":"OpenGVLab","isFork":false,"description":"Chatbot Arena meets multi-modality! Multi-Modality Arena allows you to benchmark vision-language models side-by-side while providing images as inputs. Supports MiniGPT-4, LLaMA-Adapter V2, LLaVA, BLIP-2, and many more!","allTopics":["chat","chatbot","vqa","gradio","multi-modality","large-language-models","llms","chatgpt","vision-language-model"],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":14,"starsCount":390,"forksCount":26,"license":null,"participation":[4,4,5,3,12,7,3,1,0,4,1,0,3,1,1,1,0,0,0,2,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,11,1,0,0,1,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-21T11:14:46.520Z"}},{"type":"Public","name":"LAMM","owner":"OpenGVLab","isFork":false,"description":"[NeurIPS 2023 Datasets and Benchmarks Track] LAMM: Multi-Modal Large Language Models and Applications as AI Agents","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":7,"starsCount":270,"forksCount":14,"license":null,"participation":[0,21,11,8,10,6,2,3,7,0,0,6,4,13,0,0,0,0,0,2,15,27,24,4,0,0,0,0,0,2,0,0,0,1,3,15,20,3,0,0,0,0,4,4,0,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-16T11:30:23.302Z"}},{"type":"Public","name":"DiffAgent","owner":"OpenGVLab","isFork":false,"description":"[CVPR 2024] DiffAgent: Fast and Accurate Text-to-Image API Selection with Large Language Model","allTopics":[],"primaryLanguage":null,"pullRequestCount":0,"issueCount":0,"starsCount":14,"forksCount":0,"license":"MIT License","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,4,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-16T03:57:45.379Z"}},{"type":"Public","name":"InternVideo2","owner":"OpenGVLab","isFork":false,"description":"","allTopics":[],"primaryLanguage":null,"pullRequestCount":0,"issueCount":2,"starsCount":166,"forksCount":1,"license":"MIT License","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-15T07:35:58.603Z"}},{"type":"Public","name":"Vision-RWKV","owner":"OpenGVLab","isFork":false,"description":"Vision-RWKV: Efficient and Scalable Visual Perception with RWKV-Like Architectures","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":13,"starsCount":258,"forksCount":9,"license":"Apache License 2.0","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,2,0,0,0,0,3,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-14T15:06:38.378Z"}},{"type":"Public","name":"VideoMamba","owner":"OpenGVLab","isFork":false,"description":"VideoMamba: State Space Model for Efficient Video Understanding","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":12,"starsCount":628,"forksCount":44,"license":"Apache License 2.0","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,11,2,3,0,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-10T18:57:28.146Z"}},{"type":"Public","name":"Instruct2Act","owner":"OpenGVLab","isFork":false,"description":"Instruct2Act: Mapping Multi-modality Instructions to Robotic Actions with Large Language Model","allTopics":["robotics","clip","llm","chatgpt","segment-anything"],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":0,"starsCount":275,"forksCount":18,"license":null,"participation":[0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-09T08:00:37.391Z"}},{"type":"Public","name":"MM-Interleaved","owner":"OpenGVLab","isFork":false,"description":"MM-Interleaved: Interleaved Image-Text Generative Modeling via Multi-modal Feature Synchronizer ","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":6,"starsCount":161,"forksCount":10,"license":"Apache License 2.0","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-03T03:09:27.863Z"}},{"type":"Public","name":"UniFormerV2","owner":"OpenGVLab","isFork":false,"description":"[ICCV2023] UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":1,"issueCount":9,"starsCount":268,"forksCount":15,"license":"Apache License 2.0","participation":[0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-02T17:00:02.899Z"}},{"type":"Public","name":"Hulk","owner":"OpenGVLab","isFork":false,"description":"An official implementation of \"Hulk: A Universal Knowledge Translator for Human-Centric Tasks\"","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":2,"starsCount":57,"forksCount":2,"license":"MIT License","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,1,0,2,0,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-02T07:41:39.777Z"}},{"type":"Public","name":"InternVL-MMDetSeg","owner":"OpenGVLab","isFork":false,"description":"Train InternViT-6B in MMSegmentation and MMDetection with DeepSpeed","allTopics":["object-detection","semantic-segmentation","vision-foundation"],"primaryLanguage":{"name":"Jupyter Notebook","color":"#DA5B0B"},"pullRequestCount":0,"issueCount":1,"starsCount":35,"forksCount":2,"license":null,"participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,12,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-03-27T08:42:16.030Z"}},{"type":"Public","name":"SAM-Med2D","owner":"OpenGVLab","isFork":false,"description":"Official implementation of SAM-Med2D","allTopics":[],"primaryLanguage":{"name":"Jupyter Notebook","color":"#DA5B0B"},"pullRequestCount":1,"issueCount":37,"starsCount":773,"forksCount":70,"license":"Apache License 2.0","participation":[0,0,0,0,0,0,0,0,0,0,0,0,6,43,12,4,0,0,2,0,2,1,0,0,2,6,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,2,0,0,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-03-25T12:13:23.399Z"}},{"type":"Public","name":"all-seeing","owner":"OpenGVLab","isFork":false,"description":"[ICLR 2024] This is the official implementation of the paper \"The All-Seeing Project: Towards Panoptic Visual Recognition and Understanding of the Open World\"","allTopics":["dataset","all-seeing","region-text"],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":5,"starsCount":388,"forksCount":12,"license":null,"participation":[0,0,0,0,0,0,0,0,0,10,0,0,1,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,2,0,2,0,0,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-03-22T12:15:37.667Z"}},{"type":"Public","name":"OmniQuant","owner":"OpenGVLab","isFork":false,"description":"[ICLR2024 spotlight] OmniQuant is a simple and powerful quantization technique for LLMs. ","allTopics":["quantization","large-language-models","llm"],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":1,"issueCount":18,"starsCount":601,"forksCount":45,"license":"MIT License","participation":[0,0,0,0,0,0,0,0,0,0,0,4,7,3,4,4,3,0,0,0,0,2,0,2,0,2,0,0,6,1,7,0,0,7,0,2,0,0,1,2,0,2,0,0,0,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-03-19T07:18:08.263Z"}},{"type":"Public","name":"VideoMAEv2","owner":"OpenGVLab","isFork":false,"description":"[CVPR 2023] VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking","allTopics":["video-understanding","action-detection","self-supervised-learning","temporal-action-detection","foundation-model","cvpr2023","action-recognition"],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":7,"starsCount":428,"forksCount":42,"license":"MIT License","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-03-14T12:43:53.295Z"}},{"type":"Public","name":"LLaMA-Adapter","owner":"OpenGVLab","isFork":false,"description":"[ICLR 2024] Fine-tuning LLaMA to follow Instructions within 1 Hour and 1.2M Parameters","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":4,"issueCount":104,"starsCount":5558,"forksCount":363,"license":"GNU General Public License v3.0","participation":[28,9,2,0,9,6,0,5,2,0,0,2,1,0,0,2,0,1,0,3,0,0,1,1,0,0,1,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-03-14T08:12:53.067Z"}},{"type":"Public","name":"HumanBench","owner":"OpenGVLab","isFork":false,"description":"This repo is official implementation of HumanBench (CVPR2023)","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":1,"issueCount":14,"starsCount":211,"forksCount":9,"license":"MIT License","participation":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0],"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-03-12T00:44:46.366Z"}}],"repositoryCount":57,"userInfo":null,"searchable":true,"definitions":[],"typeFilters":[{"id":"all","text":"All"},{"id":"public","text":"Public"},{"id":"source","text":"Sources"},{"id":"fork","text":"Forks"},{"id":"archived","text":"Archived"},{"id":"template","text":"Templates"}],"compactMode":false},"title":"Repositories"}