divisor.dimoo.config
Configuration file. Contains commonly used parameters and settings.
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://github.com/Alpha-VLLM/Lumina-DiMOO

# -*- coding: utf-8 -*-
"""
Configuration file
Contains commonly used parameters and settings
"""

# Generation related configuration
GENERATION_CONFIG = {
    "default_timesteps": 64,
    "default_temperature": 1.0,
    "default_cfg_scale": 4.0,
    "default_cfg_img": 4.0,
    "default_seq_len": 1024,
    "default_newline_every": 16,
    "remasking_strategy": "low_confidence",
}

# Image related configuration
# NOTE(review): heights/widths are presumably in pixels -- confirm against callers.
IMAGE_CONFIG = {
    "default_height": 512,
    "default_width": 512,
    "max_height": 1024,
    "max_width": 1024,
}

# Special token IDs
SPECIAL_TOKENS = {
    "mask_token": 126336,
    "newline_token": 126084,
    "image_token_offset": 126356,
    "answer_start": 126354,
    "answer_end": 126355,
    "boi": 126349,  # begin of image
    "eoi": 126350,  # end of image
    "uncondition": 126351,
}

# Prompt templates
PROMPT_TEMPLATES = {
    # Adjacent literals are concatenated into a single-line prompt; the
    # sentences are separated by one space, with no embedded newline.
    "text_understanding": (
        "You are a multimodal model that can process both text and images. "
        "Answer the following question based on the provided images. "
        "Analyze each image and combine relevant details to answer."
    ),
    "image_generation": "Generate an image according to the text prompt.",
    "image_editing": "Generate an image applying the following editing instruction based on the original image.",
    "dense_prediction": "Perform dense prediction on the given images.",
    "control_generation": "Generate an image according to the text prompt and the given control image.",
    "subject_generation": "Generate an image according to the text prompt and the given object image.",
    "multi_view": "Generate a view-image based on the given image.",
    "style_transfer": "Transform the current image into the style of the provided image.",
}

# Edit type configuration
EDIT_TYPE_CONFIG = {
    # Maps a short dense-map key to its human-readable description.
    "dense": {
        "canny": "canny edge map",
        "hed": "hed edge map",
        "depth": "depth map",
        "openpose": "pose estimation map",
    },
    # Names of the edit types listed as supported.
    "supported_types": [
        "canny_pred",
        "hed_pred",
        "depth_pred",
        "openpose_pred",
        "canny_control",
        "hed_control",
        "depth_control",
        "openpose_control",
        "subject_driven",
        "edit",
        "ref_transfer",
        "multi_view",
    ],
}
GENERATION_CONFIG =
{'default_timesteps': 64, 'default_temperature': 1.0, 'default_cfg_scale': 4.0, 'default_cfg_img': 4.0, 'default_seq_len': 1024, 'default_newline_every': 16, 'remasking_strategy': 'low_confidence'}
IMAGE_CONFIG =
{'default_height': 512, 'default_width': 512, 'max_height': 1024, 'max_width': 1024}
SPECIAL_TOKENS =
{'mask_token': 126336, 'newline_token': 126084, 'image_token_offset': 126356, 'answer_start': 126354, 'answer_end': 126355, 'boi': 126349, 'eoi': 126350, 'uncondition': 126351}
PROMPT_TEMPLATES =
{'text_understanding': 'You are a multimodal model that can process both text and images. Answer the following question based on the provided images. Analyze each image and combine relevant details to answer.', 'image_generation': 'Generate an image according to the text prompt.', 'image_editing': 'Generate an image applying the following editing instruction based on the original image.', 'dense_prediction': 'Perform dense prediction on the given images.', 'control_generation': 'Generate an image according to the text prompt and the given control image.', 'subject_generation': 'Generate an image according to the text prompt and the given object image.', 'multi_view': 'Generate a view-image based on the given image.', 'style_transfer': 'Transform the current image into the style of the provided image.'}
EDIT_TYPE_CONFIG =
{'dense': {'canny': 'canny edge map', 'hed': 'hed edge map', 'depth': 'depth map', 'openpose': 'pose estimation map'}, 'supported_types': ['canny_pred', 'hed_pred', 'depth_pred', 'openpose_pred', 'canny_control', 'hed_control', 'depth_control', 'openpose_control', 'subject_driven', 'edit', 'ref_transfer', 'multi_view']}