Merge lp:~pbeaman/akiban-persistit/fix-1065677-retrograde-timestamp-on-recovery into lp:akiban-persistit

Proposed by Peter Beaman
Status: Merged
Approved by: Peter Beaman
Approved revision: 379
Merged at revision: 379
Proposed branch: lp:~pbeaman/akiban-persistit/fix-1065677-retrograde-timestamp-on-recovery
Merge into: lp:akiban-persistit
Diff against target: 169 lines (+146/-1)
2 files modified
src/main/java/com/persistit/RecoveryManager.java (+2/-1)
src/test/java/com/persistit/Bug1065677Test.java (+144/-0)
To merge this branch: bzr merge lp:~pbeaman/akiban-persistit/fix-1065677-retrograde-timestamp-on-recovery
Reviewer Review Type Date Requested Status
Nathan Williams Approve
Review via email: mp+129547@code.launchpad.net

Description of the change

Fixes a subtle bug https://bugs.launchpad.net/akiban-persistit/+bug/1065677 and adds a unit test to demonstrate the bug.

To post a comment you must log in.
Revision history for this message
Nathan Williams (nwilliams) wrote :

I understand the timestamp update in scanOneRecord, that's as you described yesterday, but why the removal in scanCheckpoint?

If it is something mundane, feel free to big-A.

review: Approve
Revision history for this message
Peter Beaman (pbeaman) wrote :

It's mundane. scanOneRecord calls scanCheckpoint, so the call in scanCheckpoint would simply be redundant.

Thanks. Approving now.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'src/main/java/com/persistit/RecoveryManager.java'
2--- src/main/java/com/persistit/RecoveryManager.java 2012-10-11 20:35:03 +0000
3+++ src/main/java/com/persistit/RecoveryManager.java 2012-10-13 02:50:23 +0000
4@@ -899,6 +899,7 @@
5 final int recordSize = getLength(_readBuffer);
6 final int type = getType(_readBuffer);
7 final long timestamp = getTimestamp(_readBuffer);
8+ _persistit.getTimestampAllocator().updateTimestamp(timestamp);
9
10 if (recordSize >= _blockSize || recordSize < OVERHEAD) {
11 throw new CorruptJournalException("Bad JournalRecord length " + recordSize + " at position "
12@@ -1241,7 +1242,7 @@
13 + addressToString(address, timestamp));
14 }
15 _baseAddress = baseAddress;
16- _persistit.getTimestampAllocator().updateTimestamp(timestamp);
17+
18 _lastValidCheckpoint = checkpoint;
19 _lastValidCheckpointJournalAddress = address;
20
21
22=== added file 'src/test/java/com/persistit/Bug1065677Test.java'
23--- src/test/java/com/persistit/Bug1065677Test.java 1970-01-01 00:00:00 +0000
24+++ src/test/java/com/persistit/Bug1065677Test.java 2012-10-13 02:50:23 +0000
25@@ -0,0 +1,144 @@
26+/**
27+ * Copyright © 2012 Akiban Technologies, Inc. All rights reserved.
28+ *
29+ * This program and the accompanying materials are made available
30+ * under the terms of the Eclipse Public License v1.0 which
31+ * accompanies this distribution, and is available at
32+ * http://www.eclipse.org/legal/epl-v10.html
33+ *
34+ * This program may also be available under different license terms.
35+ * For more information, see www.akiban.com or contact licensing@akiban.com.
36+ *
37+ * Contributors:
38+ * Akiban Technologies, Inc.
39+ */
40+
41+package com.persistit;
42+
43+import static com.persistit.unit.UnitTestProperties.VOLUME_NAME;
44+
45+import java.io.File;
46+import java.io.IOException;
47+import java.io.RandomAccessFile;
48+
49+import org.junit.Test;
50+
51+import com.persistit.exception.PersistitException;
52+
53+/**
54+ * https://bugs.launchpad.net/akiban-persistit/+bug/1065677
55+ *
56+ * Doing a long delete in OLB_latest dataset, my Mac crashed, taking my VM with
57+ * it. The Vm was running akiban. It was via a local psql client.
58+ *
59+ * <pre>
60+ * INFO 15:14:05,258 Starting services.
61+ * Exception in thread "main" java.lang.AssertionError
62+ * at com.persistit.JournalManager$PageNode.setPrevious(JournalManager.java:1918)
63+ * at com.persistit.RecoveryManager.scanLoadPageMap(RecoveryManager.java:1150)
64+ * at com.persistit.RecoveryManager.scanOneRecord(RecoveryManager.java:937)
65+ * at com.persistit.RecoveryManager.findAndValidateKeystone(RecoveryManager.java:784)
66+ * at com.persistit.RecoveryManager.buildRecoveryPlan(RecoveryManager.java:
67+ * </pre>
68+ *
69+ */
70+
71+public class Bug1065677Test extends PersistitUnitTestCase {
72+
73+ private final static String TREE_NAME = "Bug1065677Test";
74+
75+ private Exchange getExchange() throws PersistitException {
76+ return _persistit.getExchange(VOLUME_NAME, TREE_NAME, true);
77+ }
78+
79+ /**
80+ * This method tries to recreate the state at which the journal files in bug
81+ * 1065677 arrived. Plan:
82+ *
83+ * 1. Write a transaction that updates multiple pages.
84+ *
85+ * 2. Flush all buffers so that there are pages to recover and then crash
86+ * Persistit so there is no checkpoint.
87+ *
88+ * 3. Restart Persistit, but advance the system timestamp to simulate
89+ * somewhat chaotic ordering of page writes created by reapplying a huge
90+ * delete operation in the transaction.
91+ *
92+ * 4. Crash once again. This will leave pages from the original epoch in the
93+ * page map and will add versions of those pages with larger timestamps.
94+ * This crash simulates the JVM being killed during recovery processing. In
95+ * the actual case it appears there is no post-recovery checkpoint, merely
96+ * lots of dirty pages. To simulate this outcome we truncate about 50K of
97+ * bytes form the end of the journal file as a surrogate for the system
98+ * having not completed recovery.
99+ *
100+ * 5. Restart the system once again. Now do some normal processing work,
101+ * simulated here by adding another transaction.
102+ *
103+ * 6. Magic happens here: we now perform a rollover, which writes both the
104+ * branch map and the page map into a single PM record. Because the PM
105+ * record is written by two separate loops, some page P found in both the
106+ * branch map and the page map is written into two separate PM sub-records.
107+ * As it turns out, the version in the branch map has a smaller timestamp
108+ * than the one in the page map. This sets up the AssertionError.
109+ *
110+ * 7. Restart Persistit to exploit the bug during scanLoadPageMap.
111+ *
112+ * @throws Exception
113+ */
114+ @Test
115+ public void breakTimestampSequence() throws Exception {
116+ doTransaction();
117+
118+ final Configuration config = _persistit.getConfiguration();
119+ final long lastTimestamp = _persistit.getCurrentTimestamp();
120+ _persistit.flush();
121+ _persistit.crash();
122+
123+ _persistit = new Persistit();
124+ _persistit.getTimestampAllocator().updateTimestamp(lastTimestamp + 1000);
125+ _persistit.initialize(config);
126+ _persistit.crash();
127+ truncate();
128+
129+ _persistit = new Persistit();
130+ config.setAppendOnly(true);
131+ _persistit.initialize(config);
132+
133+ doTransaction();
134+
135+ final JournalManager jman = _persistit.getJournalManager();
136+ jman.rollover();
137+ _persistit.close();
138+
139+ _persistit = new Persistit();
140+ _persistit.initialize(config);
141+ _persistit.close();
142+
143+ }
144+
145+ private void doTransaction() throws Exception {
146+ final Exchange ex = getExchange();
147+ final Transaction txn = ex.getTransaction();
148+ txn.begin();
149+ ex.getValue().put(RED_FOX);
150+ for (int i = 0; i < 10000; i++) {
151+ ex.to(i).store();
152+ }
153+ txn.commit();
154+ txn.end();
155+ }
156+
157+ private void truncate() throws IOException {
158+ final JournalManager jman = _persistit.getJournalManager();
159+ final long lastAddress = jman.getCurrentAddress();
160+ final File file = jman.addressToFile(lastAddress);
161+ final RandomAccessFile raf = new RandomAccessFile(file, "rw");
162+ try {
163+ final long length = raf.length();
164+ raf.setLength(length - 50000);
165+ } finally {
166+ raf.close();
167+ }
168+ }
169+}

Subscribers

People subscribed via source and target branches