divisor.dimoo.config

Configuration file. Contains commonly used parameters and settings.

View Source

 1# SPDX-License-Identifier: Apache-2.0
 2# Adapted from https://github.com/Alpha-VLLM/Lumina-DiMOO
 3
 4# -*- coding: utf-8 -*-
 5"""
 6Configuration file
 7Contains commonly used parameters and settings
 8"""
 9
10# Generation related configuration
11GENERATION_CONFIG = {  # Generation settings keyed by name; six "default_*" values plus the remasking strategy.
12    "default_timesteps": 64,
13    "default_temperature": 1.0,
14    "default_cfg_scale": 4.0,
15    "default_cfg_img": 4.0,  # NOTE(review): presumably a guidance scale for the image condition, separate from default_cfg_scale - confirm
16    "default_seq_len": 1024,
17    "default_newline_every": 16,  # NOTE(review): presumably a newline token follows every 16 image tokens - confirm against caller
18    "remasking_strategy": "low_confidence",  # the only strategy named in this file; other accepted values are not shown here
19}
20
21# Image related configuration
22IMAGE_CONFIG = {  # Default and maximum image height/width.
23    "default_height": 512,  # NOTE(review): units presumably pixels - confirm
24    "default_width": 512,
25    "max_height": 1024,  # NOTE(review): whether larger inputs are rejected or resized is decided by the caller, not here
26    "max_width": 1024,
27}
28
29# Special token IDs
30SPECIAL_TOKENS = {  # Integer IDs of the special tokens, keyed by name; the tokenizer they belong to is not shown here.
31    "mask_token": 126336,
32    "newline_token": 126084,
33    "image_token_offset": 126356,  # NOTE(review): presumably added to image token indices to obtain vocabulary IDs - confirm
34    "answer_start": 126354,
35    "answer_end": 126355,
36    "boi": 126349,  # begin of image
37    "eoi": 126350,  # end of image
38    "uncondition": 126351,  # NOTE(review): presumably the token used for the unconditional branch of guidance - confirm
39}
40
41# Prompt templates
42PROMPT_TEMPLATES = {  # Maps a task name to the fixed English instruction sentence(s) for that task.
43    "text_understanding": "You are a multimodal model that can process both text and images. Answer the following question based on the provided images. Analyze each image and combine relevant details to answer.",
44    "image_generation": "Generate an image according to the text prompt.",
45    "image_editing": "Generate an image applying the following editing instruction based on the original image.",
46    "dense_prediction": "Perform dense prediction on the given images.",
47    "control_generation": "Generate an image according to the text prompt and the given control image.",
48    "subject_generation": "Generate an image according to the text prompt and the given object image.",
49    "multi_view": "Generate a view-image based on the given image.",
50    "style_transfer": "Transform the current image into the style of the provided image.",
51}  # NOTE(review): how a template is combined with the user prompt is decided by the caller - not visible here
52
53# Edit type configuration
54EDIT_TYPE_CONFIG = {  # Edit-type metadata: descriptive phrases for the dense map types, plus the list of supported edit types.
55    "dense": {"canny": "canny edge map", "hed": "hed edge map", "depth": "depth map", "openpose": "pose estimation map"},  # short map name -> descriptive phrase
56    "supported_types": [  # the *_pred and *_control entries reuse the four names keyed under "dense"
57        "canny_pred",
58        "hed_pred",
59        "depth_pred",
60        "openpose_pred",
61        "canny_control",
62        "hed_control",
63        "depth_control",
64        "openpose_control",
65        "subject_driven",
66        "edit",
67        "ref_transfer",
68        "multi_view",
69    ],
70}

# Generation related configuration, one setting per line.
GENERATION_CONFIG = dict(
    default_timesteps=64,
    default_temperature=1.0,
    default_cfg_scale=4.0,
    default_cfg_img=4.0,
    default_seq_len=1024,
    default_newline_every=16,
    remasking_strategy="low_confidence",
)

# Image related configuration: default and maximum height/width.
IMAGE_CONFIG = dict(
    default_height=512,
    default_width=512,
    max_height=1024,
    max_width=1024,
)

# Special token IDs, keyed by token name.
SPECIAL_TOKENS = dict(
    mask_token=126336,
    newline_token=126084,
    image_token_offset=126356,
    answer_start=126354,
    answer_end=126355,
    boi=126349,  # begin of image
    eoi=126350,  # end of image
    uncondition=126351,
)

# Prompt templates: task name -> instruction text for that task.
PROMPT_TEMPLATES = {
    # Adjacent string literals are concatenated into a single string.
    "text_understanding": (
        "You are a multimodal model that can process both text and images. "
        "Answer the following question based on the provided images. "
        "Analyze each image and combine relevant details to answer."
    ),
    "image_generation": "Generate an image according to the text prompt.",
    "image_editing": (
        "Generate an image applying the following editing instruction "
        "based on the original image."
    ),
    "dense_prediction": "Perform dense prediction on the given images.",
    "control_generation": (
        "Generate an image according to the text prompt "
        "and the given control image."
    ),
    "subject_generation": (
        "Generate an image according to the text prompt "
        "and the given object image."
    ),
    "multi_view": "Generate a view-image based on the given image.",
    "style_transfer": "Transform the current image into the style of the provided image.",
}

# Edit type configuration.
EDIT_TYPE_CONFIG = {
    # Short map name -> descriptive phrase.
    "dense": {
        "canny": "canny edge map",
        "hed": "hed edge map",
        "depth": "depth map",
        "openpose": "pose estimation map",
    },
    "supported_types": [
        # "<map>_pred" entries
        "canny_pred",
        "hed_pred",
        "depth_pred",
        "openpose_pred",
        # "<map>_control" entries
        "canny_control",
        "hed_control",
        "depth_control",
        "openpose_control",
        # remaining edit types
        "subject_driven",
        "edit",
        "ref_transfer",
        "multi_view",
    ],
}