한국정보과학회 학술발표논문집, KCC 2022
folder data model/hybrid checkpoints, data, eval, config.yaml, default.yaml, log.txt train train_ast.json, train.token.ast, train.token.code, test test_ast.json, test.token.ast, test.token.code, valid valid_ast.json, valid.token.ast, valid.token.code, vocab... vocab.ast, vocab.code, - code
public int entrySize(Object key, Object value) throws IllegalArgumentException{ if (value==Token . TOMBSTONE) { return NUM_;} int size = HeapLRUCapacityController .this .getPerEntryOverhead(); size += sizeof (key); size += sizeof(value); return size; } >>==================================================================================================================================================================================================================================================================================== public int entrySize ( Object key , Object value ) throws IllegalArgumentException { if ( value == Token . TOMBSTONE ) { return NUM_ ; } int size = HeapLRUCapacityController . this . getPerEntryOverhead ( ) ; size += sizeof ( key ) ; size += sizeof ( value ) ; return size ; } >>====================================================================================================================================================================================================================================================================================
- ast.json
[{"id": 0, "type": "MethodDeclaration", "children": [1, 2, 4, 6, 13, 18, 23, 28], "value": "entrySize"}, {"id": 1, "type": "BasicType", "value": "int"}, {"id": 2, "type": "FormalParameter", "children": [3], "value": "key"}, {"id": 3, "type": "ReferenceType", "value": "Object"}, {"id": 4, "type": "FormalParameter", "children": [5], "value": "value"}, {"id": 5, "type": "ReferenceType", "value": "Object"}, {"id": 6, "type": "IfStatement", "children": [7, 10]}, {"id": 7, "type": "BinaryOperation", "children": [8, 9]}, {"id": 8, "type": "MemberReference", "value": "value"}, {"id": 9, "type": "MemberReference", "value": "Token.TOMBSTONE"}, {"id": 10, "type": "BlockStatement", "children": [11], "value": "None"}, {"id": 11, "type": "ReturnStatement", "children": [12], "value": "return"}, {"id": 12, "type": "MemberReference", "value": "NUM_"}, {"id": 13, "type": "LocalVariableDeclaration", "children": [14, 15], "value": "int"}, {"id": 14, "type": "BasicType", "value": "int"}, {"id": 15, "type": "VariableDeclarator", "children": [16], "value": "size"}, {"id": 16, "type": "This", "children": [17], "value": "HeapLRUCapacityController.this.getPerEntryOverhead"}, {"id": 17, "type": "MethodInvocation", "value": "."}, {"id": 18, "type": "StatementExpression", "children": [19]}, {"id": 19, "type": "Assignment", "children": [20, 21]}, {"id": 20, "type": "MemberReference", "value": "size"}, {"id": 21, "type": "MethodInvocation", "children": [22], "value": "sizeof"}, {"id": 22, "type": "MemberReference", "value": "key"}, {"id": 23, "type": "StatementExpression", "children": [24]}, {"id": 24, "type": "Assignment", "children": [25, 26]}, {"id": 25, "type": "MemberReference", "value": "size"}, {"id": 26, "type": "MethodInvocation", "children": [27], "value": "sizeof"}, {"id": 27, "type": "MemberReference", "value": "value"}, {"id": 28, "type": "ReturnStatement", "children": [29], "value": "return"}, {"id": 29, "type": "MemberReference", "value": "size"}]
- token.ast
( MethodDeclaration ( BasicType ) BasicType ( FormalParameter ( ReferenceType ) ReferenceType ) FormalParameter ( FormalParameter ( ReferenceType ) ReferenceType ) FormalParameter ( IfStatement ( BinaryOperation ( MemberReference ) MemberReference ( MemberReference ) MemberReference ) BinaryOperation ( BlockStatement ( ReturnStatement ( MemberReference ) MemberReference ) ReturnStatement ) BlockStatement ) IfStatement ( LocalVariableDeclaration ( BasicType ) BasicType ( VariableDeclarator ( This ( MethodInvocation ) MethodInvocation ) This ) VariableDeclarator ) LocalVariableDeclaration ( StatementExpression ( Assignment ( MemberReference ) MemberReference ( MethodInvocation ( MemberReference ) MemberReference ) MethodInvocation ) Assignment ) StatementExpression ( StatementExpression ( Assignment ( MemberReference ) MemberReference ( MethodInvocation ( MemberReference ) MemberReference ) MethodInvocation ) Assignment ) StatementExpression ( ReturnStatement ( MemberReference ) MemberReference ) ReturnStatement ) MethodDeclaration
"as far as we re concerned all entries have the same size"
source code
- __main__.py
- python3 config.yaml --train -v
- config.yaml 에서 hyper parameter 수정 가능
- 적절하게 수행
parser.add_argument('config', help='load a configuration file in the YAML format')
parser.add_argument('-v', '--verbose', action='store_true', help='verbose mode')
parser.add_argument('--debug', action='store_true', help='debug mode')
# using 'store_const' instead of 'store_true' so that the default value is `None` instead of `False`
parser.add_argument('--reset', action='store_const', const=True, help="reset model (don't load any checkpoint)")
parser.add_argument('--reset-learning-rate', action='store_const', const=True, help='reset learning rate')
parser.add_argument('--learning-rate', type=float, help='custom learning rate (triggers `reset-learning-rate`)')
parser.add_argument('--purge', action='store_true', help='remove previous model files')
- python3 config.yaml --decode “data_dir”
- data_dir : ast.json, token.ast, token.code가 있는 폴더
- python3 config.yaml --eval “data_dir”
- data_dir : ast.json, token.ast, token.code가 있는 폴더
- python3 source.code ast.json
// code
public boolean doesNotHaveIds (){
return getIds () == null || getIds ().getIds().isEmpty();
}
// AST
{"id": 0, "type": "MethodDeclaration", "children": [1, 2], "value": "doesNotHaveIds"},
{"id": 1, "type": "BasicType", "value": "boolean"},
{"id": 2, "type": "ReturnStatement", "children": [3], "value": "return"},
{"id": 3, "type": "BinaryOperation", "children": [4, 7]},
{"id": 4, "type": "BinaryOperation", "children": [5, 6]},
{"id": 5, "type": "MethodInvocation", "value": "getIds"},
{"id": 6, "type": "Literal", "value": "null"},
{"id": 7, "type": "MethodInvocation", "children": [8, 9], "value": "getIds"},
{"id": 8, "type": "MethodInvocation", "value": "."},
{"id": 9, "type": "MethodInvocation", "value": "."}
- python3
- def get_sbt_structure
// SBT
( MethodDeclaration ( BasicType ) BasicType ( FormalParameter ( ReferenceType ) ReferenceType ) FormalParameter ( FormalParameter ( ReferenceType ) ReferenceType ) FormalParameter ( IfStatement ( BinaryOperation ( MemberReference ) MemberReference ( MemberReference ) MemberReference ) BinaryOperation ( BlockStatement ( ReturnStatement ( MemberReference ) MemberReference ) ReturnStatement ) BlockStatement ) IfStatement ( LocalVariableDeclaration ( BasicType ) BasicType ( VariableDeclarator ( This ( MethodInvocation ) MethodInvocation ) This ) VariableDeclarator ) LocalVariableDeclaration ( StatementExpression ( Assignment ( MemberReference ) MemberReference ( MethodInvocation ( MemberReference ) MemberReference ) MethodInvocation ) Assignment ) StatementExpression ( StatementExpression ( Assignment ( MemberReference ) MemberReference ( MethodInvocation ( MemberReference ) MemberReference ) MethodInvocation ) Assignment ) StatementExpression ( ReturnStatement ( MemberReference ) MemberReference ) ReturnStatement ) MethodDeclaration
- python3
- def get_sbtcode_structure
( MethodDeclaration entrySize ( BasicType int ) BasicType ( FormalParameter key ( ReferenceType Object ) ReferenceType ) FormalParameter ( FormalParameter value ( ReferenceType Object ) ReferenceType ) FormalParameter ( IfStatement if ( BinaryOperation ( MemberReference value ) MemberReference ( MemberReference Token.TOMBSTONE ) MemberReference ) BinaryOperation ( BlockStatement { ( ReturnStatement return ( MemberReference NUM_ ) MemberReference ) ReturnStatement ) BlockStatement ) IfStatement ( LocalVariableDeclaration int ( BasicType int ) BasicType ( VariableDeclarator size ( This HeapLRUCapacityController.this.getPerEntryOverhead ( MethodInvocation . ) MethodInvocation ) This ) VariableDeclarator ) LocalVariableDeclaration ( StatementExpression size ( Assignment ( MemberReference size ) MemberReference ( MethodInvocation sizeof ( MemberReference key ) MemberReference ) MethodInvocation ) Assignment ) StatementExpression ( StatementExpression size ( Assignment ( MemberReference size ) MemberReference ( MethodInvocation sizeof ( MemberReference value ) MemberReference ) MethodInvocation ) Assignment ) StatementExpression ( ReturnStatement return ( MemberReference size ) MemberReference ) ReturnStatement ) MethodDeclaration
# SGD parameters learning_rate: 0.5 sgd_learning_rate: 1.0 learning_rate_decay_factor: 0.99 # training parameters max_gradient_norm: 5.0 steps_per_checkpoint: 2000 steps_per_eval: 2000 eval_burn_in: 0 max_steps: 0 max_epochs: 50 keep_best: 5 feed_previous: 0.0 optimizer: sgd moving_average: null # batch iteration parameters batch_size: 100 batch_mode: random shuffle: True read_ahead: 1 reverse_input: True # model (each one of these settings can be defined specifically in 'encoders' and 'decoders', or generally here) cell_size: 512 embedding_size: 512 attn_size: 256 layers: 1 cell_type: LSTM character_level: False truncate_lines: True # data max_train_size: 0 max_dev_size: 0 max_test_size: 0 data_dir: ../emse-data(ast_only) model_dir: ../emse-data(ast_only)/model/default train_prefix: train script_dir: scripts dev_prefix: test vocab_prefix: vocab checkpoints: [] # decoding score_function: nltk_sentence_bleu post_process_script: null remove_unk: False beam_size: 1 # general **encoders: - name: ast max_len: 500 attention_type: global decoders: - name: nl max_len: 30**
성능 확인
# 우리가 테스트 한 것 step 222000 **epoch 50** learning rate 0.306 step-time 0.791 loss 10.135 test eval: loss 36.39 starting decoding test avg_score=0.2019(**BLEU**) test 폴더 - avg_score: 0.2026 # 논문 결과 BLEU: 38.17
# data
data_dir: ../emse-data(original)
model_dir: ../emse-data(original)/model/hybrid
train_prefix: train
script_dir: scripts
dev_prefix: test
vocab_prefix: vocab
checkpoints: []

# decoding
score_function: nltk_sentence_bleu
beam_size: 5

# general
**encoders:
  - name: code
    max_len: 200
    attention_type: global
  - name: ast
    max_len: 500
    attention_type: global
decoders:
  - name: nl
    max_len: 30** # Key differences from standard configuration:
# encoder settings
bidir: False
train_initial_states: True
bidir_projection: False
time_pooling: null
pooling_avg: True
binary: False
attn_filters: 0
attn_filter_length: 0
input_layers: null
attn_temperature: 1.0
final_state: last
highway_layers: 0

# decoder settings
tie_embeddings: False
use_previous_word: True
attn_prev_word: False
softmax_temperature: 1.0
pred_edits: False
conditional_rnn: False
generate_first: True
update_first: False
rnn_feed_attn: True
use_lstm_full_state: False
pred_embed_proj: True
pred_deep_layer: False
pred_maxout_layer: True
aggregation_method: sum name: code # each encoder or decoder has a name (used for naming variables) and an extension (for files) max_len: 200 # max_len of api attention_type: global - name: ast max_len: 500 attention_type: global decoders: # Each encoder or decoder can redefine its own values for a number of parameters, - name: nl # including `cell_size`, `embedding_size` and `attn_size` max_len: 30**
성능 확인
# 우리가 테스트 한 것 step 174000 **epoch 50** learning rate 0.306 step-time 0.951 loss 8.083 test eval: loss 33.39 test avg_score=0.3806(**BLEU**) test 폴더 - avg_score: 0.3820
# data
data_dir: ../emse-data(sbt_code)
model_dir: ../emse-data(sbt_code)/model/default
train_prefix: train
script_dir: scripts
dev_prefix: test
vocab_prefix: vocab
checkpoints: []

# general
**encoders:
  - name: ast
    max_len: 500
    attention_type: global
decoders:
  - name: nl
    max_len: 30**
성능 확인
# 우리가 테스트 한 것 step 222000 epoch 50 learning rate 0.306 step-time 1.031 loss 10.060 test eval: loss 36.92 starting decoding test avg_score=0.1629**(BLEU)** test 폴더 - avg_score: 0.1626
# data
data_dir: ../emse-data(sbt_code_hb)
model_dir: ../emse-data(sbt_code_hb)/model/hybrid
train_prefix: train
script_dir: scripts
dev_prefix: test
vocab_prefix: vocab
checkpoints: []

# decoding
beam_size: 5

# general
**encoders:
  - name: code
    max_len: 200
    attention_type: global
  - name: ast
    max_len: 500
    attention_type: global
decoders:
  - name: nl
    max_len: 30**
성능 확인
step 174000 epoch 50 learning rate 0.306 step-time 1.163 loss 8.117 test eval: loss 33.40 starting decoding test avg_score=0.3788**(BLEU)** test 폴더 - avg_score: 0.3798
# data
data_dir: ../emse-data(simsbt_code)
model_dir: ../emse-data(simsbt_code)/model/default
train_prefix: train
script_dir: scripts
dev_prefix: test
vocab_prefix: vocab
checkpoints: []

# general
**encoders:
  - name: ast
    max_len: 500
    attention_type: global
decoders:
  - name: nl
    max_len: 30**
성능 확인
step 222000 epoch 50 learning rate 0.306 step-time 0.620 loss 9.092 test eval: loss 36.50 starting decoding test avg_score=0.2424**(BLEU)** test 폴더 - avg_score: 0.2417
# data
data_dir: ../emse-data(simsbt_code_hb)
model_dir: ../emse-data(simsbt_code_hb)/model/hybrid
train_prefix: train
script_dir: scripts
dev_prefix: test
vocab_prefix: vocab
checkpoints: []

# decoding
beam_size: 5

# general
**encoders:
  - name: code
    max_len: 200
    attention_type: global
  - name: ast
    max_len: 500
    attention_type: global
decoders:
  - name: nl
    max_len: 30**
성능 확인
step 174000 epoch 50 learning rate 0.306 step-time 0.756 loss 7.984 test eval: loss 33.54 starting decoding test avg_score=0.3848**(BLEU)** test 폴더 - avg_score: 0.3849
Training Details
data_RQ1 데이터 사용하지 않고 오리지널 데이터 사용해 학습
성능 확인
After 30000 steps, BLEU in test: 0.32178
Training Details
data_RQ1 데이터 사용하지 않고 오리지널 데이터 사용해 학습
성능 확인
After 501100 steps rate is 0.00001 cost is 0.00151 In iterator: 229. nowCBleu: 0.41729 maxCBlue: 0.41747 nowSBleu: 0.44359 maxSBlue: 0.44359
Metric | DeepCom | DeepCom(sbtcode) | H-DeepCom | H-DeepCom(sbtcode) | DeepCom(our model-sim SBTcode) | H-DeepCom(our model-sim SBTcode) | seCNN(default) | seTransformer |
BLEU | 0.2026 | 0.1626 | 0.3820 | 0.3798 | 0.2417 | 0.3849 | 0.32178 | 0.44359 |
METEOR | 0.3172 | 0.2741 | 0.5126 | 0.5105 | 0.3543 | 0.5164 |