The code powering m.abunchtell.com https://m.abunchtell.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

163 lines
5.1 KiB

  1. # frozen_string_literal: true
  2. module Mastodon::Snowflake
  3. DEFAULT_REGEX = /timestamp_id\('(?<seq_prefix>\w+)'/
  # Callback object whose `around_create` hook assigns a timestamp-derived ID
  # to records that are being created with a `created_at` in the past.
  class Callbacks
    # Wraps record creation.
    #
    # The record is created as-is (the block is simply yielded to, and the ID
    # is left untouched) when any of the following holds:
    # * `created_at` is nil,
    # * `created_at` is not earlier than the current UTC time,
    # * `created_at` equals `updated_at`,
    # * `record.override_timestamps` is truthy.
    #
    # Otherwise the ID is derived from `created_at` via
    # `Mastodon::Snowflake.id_at` before yielding. If the insert collides with
    # an existing row, the ID is bumped by a small random amount and the
    # insert is retried.
    #
    # @param record [#id=, #created_at, #updated_at, #override_timestamps]
    # @yield performs the actual creation
    # @raise [ActiveRecord::RecordNotUnique] once the retry budget is spent
    def self.around_create(record)
      now = Time.now.utc

      if record.created_at.nil? || record.created_at >= now || record.created_at == record.updated_at || record.override_timestamps
        yield
      else
        record.id = Mastodon::Snowflake.id_at(record.created_at)
        tries     = 0

        begin
          yield
        rescue ActiveRecord::RecordNotUnique
          raise if tries > 100

          tries += 1
          # Step by at least 1: an increment of 0 would retry the exact ID
          # that just collided and waste an attempt.
          record.id += rand(1..100)
          retry
        end
      end
    end
  end
  23. class << self
  # Creates the `timestamp_id(table_name text)` PL/pgSQL function in the
  # database, unless a function with that name is already present (see
  # `already_defined?`), in which case nothing is executed.
  #
  # Our ID will be composed of the following:
  # 6 bytes (48 bits) of millisecond-level timestamp
  # 2 bytes (16 bits) of sequence data
  #
  # The 'sequence data' is intended to be unique within a
  # given millisecond, yet obscure the 'serial number' of
  # this row.
  #
  # To do this, we hash the following data:
  # * Table name (if provided, skipped if not)
  # * Secret salt (should not be guessable)
  # * Timestamp (again, millisecond-level granularity)
  #
  # We then take the first two bytes of that value, and add
  # the lowest two bytes of the table ID sequence number
  # (`table_name`_id_seq). This means that even if we insert
  # two rows at the same millisecond, they will have
  # distinct 'sequence data' portions.
  #
  # If this happens, and an attacker can see both such IDs,
  # they can determine which of the two entries was inserted
  # first, but not the total number of entries in the table
  # (even mod 2**16).
  #
  # The table name is included in the hash to ensure that
  # different tables derive separate sequence bases so rows
  # inserted in the same millisecond in different tables do
  # not reveal the table ID sequence number for one another.
  #
  # The secret salt is included in the hash to ensure that
  # external users cannot derive the sequence base given the
  # timestamp and table name, which would allow them to
  # compute the table ID sequence number.
  #
  # Note that the salt is produced by `SecureRandom.hex(16)` when this
  # method runs and is interpolated into the function body as a literal.
  def define_timestamp_id
    # Skip the CREATE entirely when a `timestamp_id` function already exists.
    return if already_defined?

    connection.execute(<<~SQL)
      CREATE OR REPLACE FUNCTION timestamp_id(table_name text)
      RETURNS bigint AS
      $$
        DECLARE
          time_part bigint;
          sequence_base bigint;
          tail bigint;
        BEGIN
          time_part := (
            -- Get the time in milliseconds
            ((date_part('epoch', now()) * 1000))::bigint
            -- And shift it over two bytes
            << 16);
          sequence_base := (
            'x' ||
            -- Take the first two bytes (four hex characters)
            substr(
              -- Of the MD5 hash of the data we documented
              md5(table_name ||
                '#{SecureRandom.hex(16)}' ||
                time_part::text
              ),
              1, 4
            )
          -- And turn it into a bigint
          )::bit(16)::bigint;
          -- Finally, add our sequence number to our base, and chop
          -- it to the last two bytes
          tail := (
            (sequence_base + nextval(table_name || '_id_seq'))
            & 65535);
          -- Return the time part and the sequence part. OR appears
          -- faster here than addition, but they're equivalent:
          -- time_part has no trailing two bytes, and tail is only
          -- the last two bytes.
          RETURN time_part | tail;
        END
      $$ LANGUAGE plpgsql VOLATILE;
    SQL
  end
  # Makes sure every table whose `id` column defaults to `timestamp_id(...)`
  # has its backing `<prefix>_id_seq` sequence.
  #
  # Tables without an `id` column, or whose `id` default does not match
  # DEFAULT_REGEX, are skipped.
  def ensure_id_sequences_exist
    connection.tables.each do |table_name|
      # Only "id" columns are of interest.
      id_column = connection.columns(table_name).find { |column| column.name == 'id' }
      next if id_column.nil?

      # ...and only those whose default is a timestamp_id call.
      default_match = DEFAULT_REGEX.match(id_column.default_function)
      next if default_match.nil?

      # The sequence name isn't a column name, but it is a relation and
      # follows the same quoting rules in Postgres.
      quoted_sequence = connection.quote_column_name("#{default_match[:seq_prefix]}_id_seq")

      # If we were on Postgres 9.5+, we could do CREATE SEQUENCE IF NOT
      # EXISTS, but we can't depend on that. Instead, the DO block catches
      # the duplicate_table error and ignores it.
      connection.execute(<<~SQL)
        DO $$
          BEGIN
            CREATE SEQUENCE #{quoted_sequence};
          EXCEPTION WHEN duplicate_table THEN
            -- Do nothing, we have the sequence already.
          END
        $$ LANGUAGE plpgsql;
      SQL
    end
  end
  # Builds an ID for an arbitrary point in time, using the same layout as the
  # database-generated IDs: a millisecond timestamp shifted left by 16 bits,
  # with the low 16 bits holding sequence data.
  #
  # Only the whole-second part of `timestamp` is used (`to_i`); by default the
  # millisecond component and the low 16 bits are filled with random values.
  #
  # @param timestamp [#to_i] time (or epoch seconds) the ID should encode
  # @param with_random [Boolean] when false, the millisecond component and
  #   the low 16 bits are left at zero, which yields the smallest ID that can
  #   be produced for that second (deterministic; handy as a range bound)
  # @return [Integer]
  def id_at(timestamp, with_random: true)
    id  = timestamp.to_i * 1000
    id += rand(1000) if with_random
    id <<= 16
    id += rand(2**16) if with_random
    id
  end
  131. private
  # Reports whether a function named `timestamp_id` is already registered in
  # `pg_proc`.
  #
  # @return the first column of the first row returned by the EXISTS query
  def already_defined?
    existence_query = <<~SQL
      SELECT EXISTS(
        SELECT * FROM pg_proc WHERE proname = 'timestamp_id'
      );
    SQL

    first_row = connection.execute(existence_query).values.first
    first_row.first
  end
  139. def connection
  140. ActiveRecord::Base.connection
  141. end
  142. end
  143. end