[Draft] Fix sp docs #563

Open · wants to merge 31 commits into base: main
Commits (31)
b598b36
[Docs] Readthedocs (#304)
pppppM Jan 10, 2024
ff60399
[Fix] Support ZH Readthedocs (#305)
pppppM Jan 10, 2024
55d8424
update conf.py
pppppM Jan 10, 2024
b1d76e0
[Docs] Document optimization (#362)
crazysteeaam Jan 26, 2024
1505301
[Docs] Update Docs docs/en/get_started/installation.md (#364)
JimmyMa99 Feb 28, 2024
d994fb6
[Docs] Refine Quick Start (#378)
fanqiNO1 Mar 6, 2024
7e07bf9
Update overview.md (#412)
Jianfeng777 Mar 6, 2024
c360cb8
Merge branch 'main' into 'docs' (#463)
LZHgrla Mar 11, 2024
d01b5e6
[Docs] Add `docs/zh_cn/preparation/pretrained_model.md` (#462)
LZHgrla Mar 26, 2024
7dfab43
[Docs] Add `docs/zh_cn/training/multi_modal_dataset.md` (#503)
LZHgrla Mar 26, 2024
a322df4
[Docs] Improve readthedocs style (#545)
LZHgrla Apr 3, 2024
60060c2
[Docs] `.md` to `.rst` (#544)
LZHgrla Apr 9, 2024
0789534
[Docs] Add `docs/zh_cn/training/custom_pretrain_dataset.rst` (#535)
LZHgrla Apr 9, 2024
7d5bd74
[Docs] Add docs about training on large scale dataset (#517)
HIT-cwh Apr 9, 2024
efa906d
[Docs] Add internevo migration related documents (#506)
HIT-cwh Apr 9, 2024
374e854
[Docs] Add `docs/zh_cn/training/modify_settings.rst` (#490)
LZHgrla Apr 9, 2024
8db28af
[Docs] Add `length_grouped_sampler.rst` (#511)
LZHgrla Apr 9, 2024
ea6f03c
[Docs] Add accelerate related (#504)
HIT-cwh Apr 9, 2024
a30aa79
[Docs] Add visualization docs (#516)
HIT-cwh Apr 9, 2024
980dfcc
[Docs] Add docs about SFT with custom dataset (#514)
HIT-cwh Apr 9, 2024
ceee23f
[Docs] Add `docs/zh_cn/training/open_source_dataset.rst` (#502)
LZHgrla Apr 9, 2024
636004d
[Docs] Add `docs/zh_cn/preparation/prompt_template.rst` (#475)
LZHgrla Apr 9, 2024
f711a19
[Docs] Add Sequence Parallel documents (#505)
HIT-cwh Apr 9, 2024
3678629
Merge branch 'main' into docs
pppppM Apr 9, 2024
30d4ff6
delete pt version limitation
HIT-cwh Apr 9, 2024
8b1e997
[Docs] Update `docs/zh_cn` outline (#556)
LZHgrla Apr 10, 2024
f125a89
[Docs] Update `docs/en` theme (#557)
LZHgrla Apr 10, 2024
fa12b94
fix sp docs
HIT-cwh Apr 10, 2024
6afbf7f
Merge branch 'docs' of github.com:InternLM/xtuner into fix_sp_docs
HIT-cwh Apr 11, 2024
811ef92
move benchmark
HIT-cwh Apr 11, 2024
be5be20
[Fix] Fix typo (#547)
KooSung Apr 8, 2024
3 changes: 3 additions & 0 deletions benchmark/README.md
@@ -0,0 +1,3 @@
# Speed Benchmark

Please refer to the [speed benchmark documentation](https://github.com/InternLM/xtuner/tree/main/docs/zh_cn/acceleration/benchmark.rst).
212 changes: 212 additions & 0 deletions benchmark/llama2_70b/llama2_70b_full_alpaca_enzh_128k_sp8.py
@@ -0,0 +1,212 @@
# Copyright (c) OpenMMLab. All rights reserved.
from datasets import load_dataset
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from torch.optim import AdamW
from transformers import AutoModelForCausalLM, AutoTokenizer

from xtuner.dataset import ConcatDataset, process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn,
template_map_fn_factory)
from xtuner.engine.hooks import ThroughputHook, VarlenAttnArgsToMessageHubHook
from xtuner.engine.runner import TrainLoop
from xtuner.model import SupervisedFinetune
from xtuner.parallel.sequence import SequenceParallelSampler
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE

#######################################################################
# PART 1 Settings #
#######################################################################
# Model
pretrained_model_name_or_path = 'meta-llama/Llama-2-70b-hf'
use_varlen_attn = False
sequence_parallel_size = 8

# Data
alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese'
alpaca_en_path = 'tatsu-lab/alpaca'
prompt_template = PROMPT_TEMPLATE.llama2_chat
max_length = 131072 # 128k
pack_to_max_length = True

# Scheduler & Optimizer
batch_size = 1 # per_device
# Suppose we train with a per-device batch size of 1 and a maximum length of
# `max_length` on N GPUs. When the sequence parallel size is set to `SP`,
# `accumulative_counts` must be multiplied by `SP` to keep training equivalent
# to the non-SP setup: each `max_length`-token sequence is split into `SP`
# chunks, and each chunk is processed by one of the `SP` GPUs in the
# sequence-parallel group.
# bs = 32 gpus * 1 batch_size_per_device * 8 acc / 8 sequence parallel
accumulative_counts = 8
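# Illustrative arithmetic (assuming the 32-GPU setup referenced in the `bs`
# formula above): sequence_parallel_size = 8 gives 32 / 8 = 4 data-parallel
# groups, each consuming 1 sample per forward pass, so one optimizer step
# covers 4 * 1 * 8 (accumulative_counts) = 32 samples, the same global batch
# size as 32 GPUs without sequence parallelism and accumulative_counts = 1.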
dataloader_num_workers = 4
max_epochs = 3
optim_type = AdamW
lr = 2e-5
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1 # grad clip
warmup_ratio = 0.03
log_interval = 1

# Save
save_steps = -1 # speed only
save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
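# NOTE: with `save_steps = -1` and `save_last=False` in the checkpoint hook
# below, no checkpoints are written; this config is meant for speed
# benchmarking only.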

# Evaluate the generation performance during the training
evaluation_freq = 50
SYSTEM = SYSTEM_TEMPLATE.alpaca
evaluation_inputs = [
'请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
]

#######################################################################
# PART 2 Model & Tokenizer #
#######################################################################
tokenizer = dict(
type=AutoTokenizer.from_pretrained,
pretrained_model_name_or_path=pretrained_model_name_or_path,
trust_remote_code=True,
padding_side='right')

model = dict(
type=SupervisedFinetune,
use_varlen_attn=use_varlen_attn,
llm=dict(
type=AutoModelForCausalLM.from_pretrained,
pretrained_model_name_or_path=pretrained_model_name_or_path,
trust_remote_code=True))

#######################################################################
# PART 3 Dataset & Dataloader #
#######################################################################
alpaca_en = dict(
type=process_hf_dataset,
dataset=dict(type=load_dataset, path=alpaca_en_path),
tokenizer=tokenizer,
max_length=max_length,
dataset_map_fn=alpaca_map_fn,
template_map_fn=dict(
type=template_map_fn_factory, template=prompt_template),
remove_unused_columns=True,
shuffle_before_pack=True,
pack_to_max_length=pack_to_max_length,
use_varlen_attn=use_varlen_attn)

alpaca_zh = dict(
type=process_hf_dataset,
dataset=dict(type=load_dataset, path=alpaca_zh_path),
tokenizer=tokenizer,
max_length=max_length,
dataset_map_fn=alpaca_zh_map_fn,
template_map_fn=dict(
type=template_map_fn_factory, template=prompt_template),
remove_unused_columns=True,
shuffle_before_pack=True,
pack_to_max_length=pack_to_max_length,
use_varlen_attn=use_varlen_attn)

train_dataset = dict(type=ConcatDataset, datasets=[alpaca_en, alpaca_zh])

train_dataloader = dict(
batch_size=batch_size,
num_workers=dataloader_num_workers,
dataset=train_dataset,
sampler=dict(type=SequenceParallelSampler, seed=1024),
collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
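# NOTE on the sampler above (assumed behavior, not spelled out in this diff):
# `SequenceParallelSampler` shards the dataset across data-parallel groups
# (world_size // sequence_parallel_size) rather than across individual ranks,
# so all GPUs in the same sequence-parallel group receive identical samples
# and each works on its own slice of every sequence.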

#######################################################################
# PART 4 Scheduler & Optimizer #
#######################################################################
# optimizer
optim_wrapper = dict(
type=AmpOptimWrapper,
optimizer=dict(
type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
accumulative_counts=accumulative_counts,
loss_scale='dynamic',
dtype='float16')

# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
param_scheduler = [
dict(
type=LinearLR,
start_factor=1e-5,
by_epoch=True,
begin=0,
end=warmup_ratio * max_epochs,
convert_to_iter_based=True),
dict(
type=CosineAnnealingLR,
eta_min=0.0,
by_epoch=True,
begin=warmup_ratio * max_epochs,
end=max_epochs,
convert_to_iter_based=True)
]

# train, val, test setting
train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)

#######################################################################
# PART 5 Runtime #
#######################################################################
# Track training throughput during the benchmark; when `use_varlen_attn` is
# enabled, the variable-length attention arguments hook is appended below.
custom_hooks = [dict(type=ThroughputHook)]

if use_varlen_attn:
custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]

# configure default hooks
default_hooks = dict(
# record the time of every iteration.
timer=dict(type=IterTimerHook),
# print log every `log_interval` iterations.
logger=dict(
type=LoggerHook, log_metric_by_epoch=False, interval=log_interval),
# enable the parameter scheduler.
param_scheduler=dict(type=ParamSchedulerHook),
# save checkpoint per `save_steps`.
checkpoint=dict(
type=CheckpointHook,
by_epoch=False,
interval=save_steps,
save_last=False,
max_keep_ckpts=save_total_limit),
# set sampler seed in distributed environment.
sampler_seed=dict(type=DistSamplerSeedHook),
)

# configure environment
env_cfg = dict(
# whether to enable cudnn benchmark
cudnn_benchmark=False,
# set multi process parameters
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
# set distributed parameters
dist_cfg=dict(backend='nccl'),
)

# set visualizer
visualizer = None

# set log level
log_level = 'INFO'

# load from which checkpoint
load_from = None

# whether to resume training from the loaded checkpoint
resume = False

# Default to a random seed with `deterministic` disabled
randomness = dict(seed=None, deterministic=False)

# set log processor
log_processor = dict(by_epoch=False, window_size=log_interval)