1 # Copyright (c) 2013 The Native Client Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 """A script to dump a trie from a validator DFA."""
20 def PadToBundleSize(byte_list):
21 """Return bytes appended with NOPs to form a full bundle."""
22 assert len(byte_list) <= validator.BUNDLE_SIZE
23 return byte_list + [NOP] * (validator.BUNDLE_SIZE - len(byte_list))
26 # All the conditions for an x86_64 instruction under which it may be valid.
27 # Any register might be restricted. None is included because it indicates
28 # that no register is restricted coming into the instruction.
29 ALL_CONDITIONS = set(validator.ALL_REGISTERS + [None])
32 def CheckAndFormatRRInfo(input_rr_set, output_rr):
33 """Formats restricted register information in order to store them in a trie.
36 input_rr_set: The set of allowed input restricted registers. This is
37 either an individual register that is required to be restriced (or
38 encodes that RSP/RBP cannot be restricted). Not empty.
40 output_rr: An output restricted register produced (or None)
43 input_rr_str: A string version encoding the input restricted register.
44 output_rr_str: A string version encoding the output restricted register.
47 ValueError: If input_rr_set or output_rr is invalid.
52 # If multiple different restricted registers cause the instruction to be
53 # validated, then essentially any nonspecial register should be allowed.
54 # Notably, this means for example that either '%r13' or '%r12' being
55 # restricted incoming would cause the instruction to be valid but '%r8'
56 # being restricted on the input would cause an invalid instruction.
57 if len(input_rr_set) > 1:
58 if (input_rr_set != (ALL_CONDITIONS - set((validator.NC_REG_RSP,
59 validator.NC_REG_RBP)))):
60 raise ValueError('Invalid input rr set', input_rr_set)
61 input_rr_str = 'any_nonspecial'
63 input_rr_str = validator.REGISTER_NAMES[input_rr_set.pop()]
65 if output_rr == validator.NC_REG_R15:
66 raise ValueError('Instruction produces R15 as a restricted register.')
69 output_rr_str = 'None'
71 output_rr_str = validator.REGISTER_NAMES[output_rr]
73 return input_rr_str, output_rr_str
76 def ValidateInstructionAndGetRR(bitness, instruction, validator_inst):
77 """Verify the byte list against the DFA and get restricted register info.
79 Even though walking the DFA might result in instruction being accepted,
80 the instruction might actually get rejected by a DFA action. So, we run
81 the byte sequence through the validator to actually ensure that the bytes
82 are accepted before passing them onto a trie. Additionally, extract
83 restricted register information for the byte sequence.
87 instruction: byte list obtained from walking accept states of the DFA.
88 validator_inst: instance of the x86 validator loaded with DFA+actions.
90 valid: True/False about whether the byte sequence is actually accepted.
91 input_rr: Incoming restricted registers that are allowed to accept the
92 sequence. Always empty for the x86-32 case.
93 output_rr: Outgoing condition that is generated by the DFA.
94 Always empty for the x86-32 case.
96 ValueError: If instruction generates multiple output restricted registers.
98 bundle = ''.join(map(chr, PadToBundleSize(instruction)))
100 result = validator_inst.ValidateChunk(bundle, bitness)
101 return (result, '', '')
103 # Walk through all of the conditions of different registers being input
104 # restricted, as well as no input rrs being restricted and see if any of
105 # them cause the byte sequence to be validated. We need to do this
106 # because we are processing an instruction at a time and not a bundle
108 valid_input_rrs = set()
109 valid_output_rrs = set()
110 for initial_rr in ALL_CONDITIONS:
111 valid, final_rr = validator_inst.ValidateAndGetFinalRestrictedRegister(
112 bundle, len(instruction), initial_rr)
114 valid_input_rrs.add(initial_rr)
115 valid_output_rrs.add(final_rr)
117 # If no input RRs are allowed then instruction is not valid.
118 if not valid_input_rrs:
119 return (False, '', '')
121 # All valid input RRs should produce one unique output RR (could be None).
122 # i.e the produced output rr should not vary based on input RR.
123 if len(valid_output_rrs) != 1:
124 raise ValueError('Multiple output RRs produced', instruction,
126 known_final_rr = valid_output_rrs.pop()
128 input_rr_str, output_rr_str = CheckAndFormatRRInfo(
129 valid_input_rrs, known_final_rr)
130 return (True, input_rr_str, output_rr_str)
133 class WorkerState(object):
134 """Maintains an uncompressed subtrie while receiving accepting instructions.
136 The main thread will compress this subtrie into the main trie when this
137 worker is done. This is because it's inefficient (and difficult to maintain
138 threadsafety) to compress nodes into a common node cache in the workers
142 def __init__(self, validator):
143 self.total_instructions = 0
145 self.validator = validator
146 self.sub_trie = trie.Node()
148 def ReceiveInstruction(self, byte_list):
149 """Update trie if sequence passes validator and sandboxing checks."""
150 self.total_instructions += 1
152 # This is because even though the state is accepting with regards to the
153 # DFA, extra DFA actions or other validator rules might make the sequence
155 is_valid, input_rr, output_rr = ValidateInstructionAndGetRR(
156 options.bitness, byte_list, self.validator)
159 trie.AddToUncompressedTrie(
160 self.sub_trie, map(str, byte_list),
161 trie.AcceptInfo(input_rr=input_rr, output_rr=output_rr))
164 def Worker((dfa_prefix, dfa_state_index)):
165 """Traverse a portion of the DFA, and compute the associated subtrie."""
166 worker_state = WorkerState(worker_validator)
169 dfa_traversal.TraverseTree(
170 dfa.states[dfa_state_index],
171 final_callback=worker_state.ReceiveInstruction,
176 traceback.print_exc() # because multiprocessing imap swallows traceback
180 worker_state.total_instructions,
181 worker_state.num_valid,
182 worker_state.sub_trie)
186 """Parse command line options."""
187 parser = optparse.OptionParser(usage='%prog [options] xmlfile')
189 parser.add_option('--bitness',
190 choices=['32', '64'],
191 help='The subarchitecture: 32 or 64')
192 parser.add_option('--validator_dll',
193 help='Path to librdfa_validator_dll')
194 parser.add_option('--decoder_dll',
195 help='Path to librdfa_decoder_dll')
196 parser.add_option('--trie_path',
197 help='Path to write the output trie to')
199 options, args = parser.parse_args()
200 options.bitness = int(options.bitness)
202 if not options.trie_path:
203 parser.error('specify an output path to a trie')
206 parser.error('specify one xml file')
210 return options, xml_file
214 # We keep these global to share state graph between workers spawned by
215 # multiprocess. Passing it every time is slow.
216 global options, xml_file
218 global worker_validator
219 options, xml_file = ParseOptions()
220 dfa = dfa_parser.ParseXml(xml_file)
221 worker_validator = validator.Validator(
222 validator_dll=options.validator_dll,
223 decoder_dll=options.decoder_dll)
225 assert dfa.initial_state.is_accepting
226 assert not dfa.initial_state.any_byte
228 sys.stderr.write('%d states\n' % len(dfa.states))
229 num_suffixes = dfa_traversal.GetNumSuffixes(dfa.initial_state)
230 num_instructions = sum(
231 num_suffixes[t.to_state]
232 for t in dfa.initial_state.forward_transitions.values())
233 sys.stderr.write('%d instructions\n' % num_instructions)
234 tasks = dfa_traversal.CreateTraversalTasks(dfa.states,
236 sys.stderr.write('%d tasks\n' % len(tasks))
238 pool = multiprocessing.Pool()
239 results = pool.imap(Worker, tasks)
244 node_cache = trie.NodeCache()
245 full_trie = node_cache.empty_node
247 # The individual workers create subtries that we merge in and compress here.
248 for count, valid_count, sub_trie, in results:
250 num_valid += valid_count
251 full_trie = node_cache.Merge(full_trie, sub_trie)
252 sys.stderr.write('%.2f%% completed\n' % (total * 100.0 / num_instructions))
253 sys.stderr.write('%d instructions were processed\n' % total)
254 sys.stderr.write('%d valid instructions\n' % num_valid)
256 trie.WriteToFile(options.trie_path, full_trie)
258 if __name__ == '__main__':